<a href="https://colab.research.google.com/github/GilenW/CSE151A_FINAL_PROJECT/blob/main/Copy_of_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip3 install pandas
# !pip3 install seaborn
# !pip3 install matplotlib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Load the layoffs dataset. The path is relative, so 'layoffs.csv' must be in
# the kernel's working directory (on Colab, upload it first); otherwise
# read_csv raises FileNotFoundError.
layoff_filepath = 'layoffs.csv'
layoffdf = pd.read_csv(layoff_filepath)

FileNotFoundError: [Errno 2] No such file or directory: 'layoffs.csv'

**Since the goal of this project is to predict a new target, 'Laid off Quarter', derived from the 'date' feature, we add the new target column here, before the data-processing stage, so that we can explore the potential relationship between the target and the other features.**


Q1: January, February, March

Q2: April, May, June

Q3: July, August, September

Q4: October, November, December

In [None]:
# Derive the target column 'quarters' (1-4) from each row's layoff date and
# then drop the raw 'date' column.
#   Q1 = Jan-Mar, Q2 = Apr-Jun, Q3 = Jul-Sep, Q4 = Oct-Dec
# The guard makes the cell safe to re-run: once 'date' has been dropped there
# is nothing left to do.
if 'date' in layoffdf.columns:
    # Dates are expected as 'YYYY-MM-DD'; anything else raises here instead of
    # being silently binned into the wrong quarter.
    layoff_dates = pd.to_datetime(layoffdf['date'], format='%Y-%m-%d')
    # Casting to int64 fails loudly on a missing date rather than leaving a
    # NaN in the target column.
    layoffdf['quarters'] = layoff_dates.dt.quarter.astype('int64')
    layoffdf = layoffdf.drop(columns=['date'])

## Data Exploration

In [None]:
# Preview the first rows of the frame (head() shows 5 by default).
layoffdf.head()

In [None]:
# (number of rows, number of columns)
layoffdf.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
layoffdf.describe()

In [None]:
# Keep the column labels in `columns` and display them.
columns = layoffdf.columns
columns

In [None]:
# Data type of every column.
layoffdf.dtypes

In [None]:
# Number of missing values in every column.
layoffdf.isnull().sum()

In [None]:
# Report the share of missing values in each numeric column. The counts are
# computed from the frame itself so the figures stay correct if the CSV changes.
missing_counts = layoffdf.isnull().sum()
n_rows = layoffdf.shape[0]
print("Percentage of missing values:")
for col in ['total_laid_off', 'percentage_laid_off', 'funds_raised']:
    # float() keeps round() returning a plain int regardless of NumPy version.
    pct_missing = round((float(missing_counts[col]) / n_rows) * 100)
    print(f"{col} contains {pct_missing} % missing values")

## Data Visualization

### Plots for numerical feature: 'total_laid_off'

In [None]:
# Histogram of 'total_laid_off' on a log-scaled axis, with a KDE curve overlaid.
sns.histplot(layoffdf['total_laid_off'],log_scale=True,kde=True)

In [None]:
# How often each distinct 'total_laid_off' value occurs (NaN is excluded by default).
layoffdf['total_laid_off'].value_counts()

### Plots for numerical feature: 'percentage_laid_off'

In [None]:
# Histogram of 'percentage_laid_off' with a KDE curve overlaid.
sns.histplot(layoffdf['percentage_laid_off'],kde=True)


In [None]:
# How often each distinct 'percentage_laid_off' value occurs (NaN is excluded by default).
layoffdf['percentage_laid_off'].value_counts()

### Plots for numerical feature: 'funds_raised'

In [None]:
# Collect the non-missing 'funds_raised' values as a plain list for plotting.
# dropna() replaces the per-element filter that relied on bitwise `~` applied
# to np.isnan's result.
funds = layoffdf['funds_raised'].dropna().tolist()
# Show the minimum: a minimum of 0 means a plain log scale cannot be used.
min(funds)

In [None]:
# np.log1p computes log(1 + x) for every value, so a value of 0 maps to 0
# instead of -inf; plot the histogram of the transformed funds with a KDE curve.
sns.histplot(np.log1p(funds),kde=True)

In [None]:
# How often each distinct 'funds_raised' value occurs (NaN is excluded by default).
layoffdf['funds_raised'].value_counts()

### Correlations

In [None]:
# Pairwise relationships between the columns, with points coloured by 'quarters'.
sns.pairplot(layoffdf, hue='quarters')

In [None]:
# Correlation matrix of the numeric columns only. At this point the frame
# still contains the categorical columns (industry, stage, location, ...), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict the
# frame to numeric dtypes first.
corr = layoffdf.select_dtypes(include='number').corr()
# Diverging colour map centred on 0, fixed to the full [-1, 1] range.
sns.heatmap(corr, vmin=-1, vmax=1, center=0, annot=True, cmap= 'RdBu')

### Categorical features

In [None]:
# For every categorical column (plus the target), chart its five most frequent
# values twice -- as a pie chart and as a bar chart -- then print those values.
column_names = ['industry', 'stage', 'location', 'country', 'company', 'quarters']
for column_name in column_names:
    value_counts = layoffdf[column_name].value_counts().head(5)

    # Relative share of each of the top five values.
    pie_fig, pie_ax = plt.subplots(figsize=(10, 6))
    pie_ax.pie(value_counts, labels=value_counts.index, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title(f'Top 5 frequent {column_name}')
    plt.show()

    # Absolute counts of the same values.
    bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
    value_counts.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title(f'Frequency in feature {column_name}')
    bar_ax.set_ylabel('Count')
    plt.show()
    print(value_counts.index)



In [None]:
# For each categorical column, count how its five most frequent values are
# distributed over the quarters and draw the counts as an annotated heatmap.
column_names = ['industry', 'stage', 'location', 'country', 'company']
for column_name in column_names:
    top_list = layoffdf[column_name].value_counts().head(5).index.tolist()
    # Keep only the rows whose value is one of the top five.
    top_rows = layoffdf[layoffdf[column_name].isin(top_list)]
    # Rows: category value, columns: quarter, cells: number of layoff events.
    new_df = pd.crosstab(top_rows[column_name], top_rows['quarters'])

    heat_fig, heat_ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(new_df, annot=True, fmt='d', ax=heat_ax)
    heat_ax.set_xlabel('Quarter')
    heat_ax.set_ylabel(column_name)
    plt.show()

# Data processing

### Handle missing values

We replace missing numerical values with the mean of their column.

In [None]:
from sklearn.impute import SimpleImputer
# Fill the missing values of the three numeric columns with each column's mean.
# NOTE(review): the imputer is fit on the whole dataset, i.e. before the
# train/test split performed later in this notebook, so the means include rows
# that end up in the test set -- confirm this is acceptable.
columns_to_impute = ['total_laid_off', 'percentage_laid_off', 'funds_raised']
imputer = SimpleImputer(strategy='mean')
layoffdf[columns_to_impute] = imputer.fit_transform(layoffdf[columns_to_impute])
# Re-check the per-column missing counts after imputation.
layoffdf.isnull().sum()

We drop the rows with missing values in the categorical features, since only a few rows are affected.

In [None]:
# Remove every row that has no value in at least one of these categorical columns.
columns_with_missing = ['location', 'industry', 'stage']
layoffdf = layoffdf.dropna(subset=columns_with_missing)


In [None]:
# Per-column missing counts after the rows with missing categorical values were dropped.
layoffdf.isnull().sum()

### Rescale data

We rescale the numerical features because we plan to use neural network models.


In [None]:
# Summary statistics of the numeric columns before rescaling.
layoffdf.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the three numeric feature columns to MinMaxScaler's default [0, 1] range.
# NOTE(review): the scaler is fit on the whole dataset, i.e. before the
# train/test split performed later in this notebook, so the min/max include
# rows that end up in the test set -- confirm this is acceptable.
min_max_scaler = MinMaxScaler()
columns_to_normalize = ['total_laid_off', 'funds_raised','percentage_laid_off']
layoffdf[columns_to_normalize] = min_max_scaler.fit_transform(layoffdf[columns_to_normalize])


In [None]:
# Per-column missing counts after rescaling.
layoffdf.isnull().sum()

In [None]:
# Same check again: DataFrame.isna is an alias of DataFrame.isnull.
layoffdf.isna().sum()


### Transform Categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features into a dense array. drop='first'
# removes one indicator column per feature, and handle_unknown='ignore' encodes
# categories unseen during fit as all zeros.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back to the old one on older releases.
encoder_kwargs = dict(drop='first', handle_unknown='ignore')
try:
    encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`.
    encoder = OneHotEncoder(sparse=False, **encoder_kwargs)
categorical_cols = ['industry', 'stage','location', 'country','company']
encoded_data = encoder.fit_transform(layoffdf[categorical_cols])
# One column per retained category, named '<feature>_<category>'.
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))


In [None]:
# Give the encoded frame the same index as layoffdf (rows were dropped earlier,
# so layoffdf's index has gaps) so that the column-wise concat aligns row by row.
encoded_df.index = layoffdf.index
# Replace the raw categorical columns with their one-hot encoded counterparts.
layoffdf = layoffdf.drop(columns=categorical_cols)
layoffdf = pd.concat([layoffdf, encoded_df], axis=1)

In [None]:
# Display the fully processed frame (numeric features, 'quarters', one-hot columns).
layoffdf

# Modeling

In [None]:
from sklearn.model_selection import train_test_split
# Features: every column except the target; target: the quarter label.
X = layoffdf.drop('quarters', axis=1)
y = layoffdf['quarters']
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the imputer and scaler above were fit before this split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### First Model: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Multinomial logistic regression on the quarter labels. The explicit
# `multi_class='multinomial'` argument is deprecated since scikit-learn 1.5;
# with the lbfgs solver and more than two classes the default already fits a
# multinomial model, so the argument is simply omitted.
model_lg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
model_lg.fit(X_train, y_train)
y_train_pred = model_lg.predict(X_train)
y_test_pred = model_lg.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Plot the misclassification rate (1 - accuracy) on the training and test sets.
plt.bar(['Training error', 'Test error'], [1-train_accuracy, 1-test_accuracy])
plt.ylabel('Error')
plt.title('Training vs. Test Error')
plt.show()



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Size of the softmax output layer. After the train/test split y_train is a
# 1-D Series of quarter labels, so `y_train.shape[1]` raises IndexError; count
# the distinct labels instead. A 2-D (one-hot encoded) target keeps using its
# column count.
# NOTE(review): the labels are 1..4, not 0..3 -- shift or one-hot encode them
# before training with a (sparse) categorical cross-entropy loss.
n_classes = y_train.shape[1] if y_train.ndim > 1 else len(np.unique(y_train))

# Feed-forward classifier: two ReLU hidden layers followed by a softmax over the classes.
model = Sequential()

model.add(Dense(12, activation='relu', input_shape=(X_train.shape[1],)))

model.add(Dense(8, activation='relu'))

model.add(Dense(n_classes, activation='softmax'))