# Introduction to Data Science

Authors: Lior Tondovski, Ilan Vasilevski, Maya Vilenko

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.utils import resample
# import smote
from imblearn.over_sampling import SMOTE
# import the standard scaler
from sklearn.preprocessing import StandardScaler

from utils import *
from Config import *

In [None]:
# Load the train and test CSV files; both paths are names defined outside this
# cell (presumably in Config — confirm).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (training_data_path, test_data_path)
)

In [None]:
# Report the (rows, columns) shape of each split and the test/train row ratio.
test_to_train_ratio = round(len(test_data) / len(train_data), 2)
print(f'the shape of the train_data is {train_data.shape}')
print(f'the shape of the test_data is {test_data.shape}')
print(f'the ratio between the train and test data is {test_to_train_ratio}')

##### EDA

In [None]:
# Preview the first three rows of the training data.
train_data.head(n=3)

In [None]:
# Preview the first three rows of the test data.
test_data.head(n=3)

In [None]:
# Number of distinct non-null values in every column of the training data.
train_data.nunique(axis=0, dropna=True)

It can be seen, for example, that the `resolution` column has a single value, so it does not add any information; we will remove it in the preprocessing phase.

In [None]:
# Print pandas' summary of the training frame, including the dtype of each column.
train_data.info()

In [None]:
# Bar chart of the click ratio per state.
# The per-state mean of 'clicked' is the share of clicked rows in that state
# (assuming 'clicked' is boolean / 0-1 at this point — confirm).
state_clicks = (
    train_data.groupby('state')['clicked']
    .mean()
    .reset_index()
    .sort_values(by='clicked', ascending=False)
)

# Draw the states ordered from the highest to the lowest click ratio.
plt.figure(figsize=(15, 8))
sns.barplot(data=state_clicks, x='state', y='clicked')
plt.title('Clicked ratio per state', fontsize=20)
plt.xlabel('State', fontsize=15)
plt.ylabel('Clicked ratio', fontsize=15)
# Rotate the x tick labels to vertical.
plt.xticks(rotation=90)
plt.show()


There is not much variation in the click ratio between the states.

In [None]:
# Bar chart of the click ratio per app category.
# The per-category mean of 'clicked' is the share of clicked rows in that
# category (assuming 'clicked' is boolean / 0-1 at this point — confirm).
app_clicks = (
    train_data.groupby('app_cat')['clicked']
    .mean()
    .reset_index()
    .sort_values(by='clicked', ascending=False)
)

# Draw the categories ordered from the highest to the lowest click ratio.
plt.figure(figsize=(15, 8))
sns.barplot(data=app_clicks, x='app_cat', y='clicked')
plt.title('Clicked ratio per app category', fontsize=20)
plt.xlabel('App category', fontsize=15)
plt.ylabel('Clicked ratio', fontsize=15)
# Rotate the x tick labels to vertical.
plt.xticks(rotation=90)
plt.show()


The graph shows that the click ratio varies considerably between app categories.

In [None]:
# Bar chart of the click ratio per ad location (banner position).
# The per-position mean of 'clicked' is the share of clicked rows at that
# position (assuming 'clicked' is boolean / 0-1 at this point — confirm).
location_clicks = (
    train_data.groupby('banner_pos')['clicked']
    .mean()
    .reset_index()
    .sort_values(by='clicked', ascending=False)
)

# Plot the click ratio of every banner position.
plt.figure(figsize=(15, 8))
sns.barplot(data=location_clicks, x='banner_pos', y='clicked')
plt.title('Clicked ratio per location', fontsize=20)
plt.xlabel('Location', fontsize=15)
plt.ylabel('Clicked ratio', fontsize=15)
# Rotate the x tick labels to vertical.
plt.xticks(rotation=90)
plt.show()


According to this graph, ads at the bottom/right of the screen are less likely to be clicked!

##### Missing Values

In [None]:
# Build the missing-value summary of each split with the project helper
# check_missing_values (defined outside this notebook).
train_data_missing_values, test_data_missing_values = (
    check_missing_values(frame) for frame in (train_data, test_data)
)

In [None]:
# Show only the training-data summary rows whose missing-value percentage is above zero.
train_has_missing = train_data_missing_values['missing_values_percentage'] > 0
train_data_missing_values.loc[train_has_missing]

In [None]:
# Show only the test-data summary rows whose missing-value percentage is above zero.
test_has_missing = test_data_missing_values['missing_values_percentage'] > 0
test_data_missing_values.loc[test_has_missing]

In [None]:
# Fill the missing values of both splits through the project helper
# replace_missing_values (defined outside this notebook).
# The intent is to use an explicit 'unknown' category so that the fact that a
# value was missing is kept as information instead of being discarded.
train_data, test_data = (
    replace_missing_values(frame) for frame in (train_data, test_data)
)

##### Preprocess & Feature Extraction

In [None]:
# One-hot encode state, manufacturer, app_cat, banner_pos and device_version.
get_dummis = ['state', 'manufacturer', 'app_cat', 'banner_pos', 'device_version']

# Remember the pre-encoding columns so the generated dummy columns can be
# identified exactly (by difference) rather than guessed from name prefixes.
train_columns_before = set(train_data.columns)
test_columns_before = set(test_data.columns)

train_data = pd.get_dummies(train_data, columns=get_dummis, prefix=get_dummis)
test_data = pd.get_dummies(test_data, columns=get_dummis, prefix=get_dummis)

# The two splits are encoded independently, so a category that appears in only
# one of them would yield a dummy column the other split lacks. Align the test
# dummies to the train dummies: categories unseen in train are dropped and
# categories absent from test become all-zero columns. When both splits contain
# the same categories this is a no-op.
train_dummy_columns = [col for col in train_data.columns if col not in train_columns_before]
test_passthrough_columns = [col for col in test_data.columns if col in test_columns_before]
test_data = test_data.reindex(
    columns=test_passthrough_columns + train_dummy_columns, fill_value=0
)

# Drop the columns that are not used as features:
#   op_id      - assigns a unique id to each row, so it carries no signal
#   app_id     - dropped because of its high cardinality
#   resolution - holds a single value
unused_columns = ['op_id', 'app_id', 'resolution']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

# Turn the target into a 0/1 integer column (1 where the value equals True).
train_data['clicked'] = (train_data['clicked'] == True).astype('int64')

In [None]:
# Derive one new numeric feature per high-cardinality categorical column via
# the project helper check_the_ratio_of_clicked_from_feature (defined outside
# this notebook; presumably the click ratio of each category — confirm).
categorical_columns = ['user_isp', 'device_model']
new_feature_names = ['user_isp_ratio_clicked', 'device_mode_ratio_clicked']
train_data, test_data = check_the_ratio_of_clicked_from_feature(
    train_data,
    test_data,
    categorical_columns,
    new_feature_names,
)

# The original categorical columns are replaced by the derived features.
train_data = train_data.drop(columns=categorical_columns)
test_data = test_data.drop(columns=categorical_columns)

In [None]:
# Derive hour, day, month and year features from the timestamp of both splits,
# then drop the raw timestamp column.
for frame in (train_data, test_data):
    # Parse the timestamp column to datetime so the .dt accessor is available.
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    for part in ('hour', 'day', 'month', 'year'):
        frame[part] = getattr(frame['timestamp'].dt, part)
    frame.drop(columns=['timestamp'], inplace=True)

##### Target Variable Distribution - Is the Data Balanced?

In [None]:
# Check whether the target is balanced: count each class once and report it
# both as an absolute number and as a percentage of all training rows.
n_total_samples = train_data.shape[0]
n_negative_samples = int((train_data.clicked == 0).sum())
n_positive_samples = int((train_data.clicked == 1).sum())

print(f'number of negative samples {n_negative_samples}')
print(f'number of positive samples {n_positive_samples}')
# The values below are shares of the whole training set, not class-to-class
# ratios, so the messages describe them as percentages of the train data.
print(f'The percentage of negative samples in the train data is {(n_negative_samples / n_total_samples)*100:.2f}%')
print(f'The percentage of positive samples in the train data is {(n_positive_samples / n_total_samples)*100:.2f}%')

In [None]:
# Count plot of the target: one bar per value of 'clicked'.
target_ax = sns.countplot(data=train_data, x='clicked')
target_ax.legend(['Not clicked', 'Clicked'])

As can be seen, this dataset is highly imbalanced!

##### Correlation Analysis

In [None]:
# There are many features, so only those with the highest correlation with the
# target variable are plotted.
# Pairwise correlations, with rows ordered by their correlation with 'clicked'.
correlation = train_data.corr().sort_values(by='clicked', ascending=False)

# Keep the ten top rows and the matching ten columns to get a square matrix.
top_correlation = correlation.head(10)
top_features = top_correlation.index.tolist()

plt.figure(figsize=(15, 8))
sns.heatmap(top_correlation[top_features], annot=True, cmap='coolwarm')
plt.title('Correlation matrix of the features with the highest correlation with the target variable', fontsize=20)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Features', fontsize=15)
plt.show()

In [None]:
# Remove highly correlated features. The candidates are every column except
# the target; the columns to drop are chosen by the project helper
# drop_correlated_features (defined outside this notebook) with a 0.9 threshold.
features = [column for column in train_data.columns if column != 'clicked']

to_drop = drop_correlated_features(data=train_data[features], corr_threshold=0.9)

# Drop the same columns from both splits so they keep the same feature set.
for frame in (train_data, test_data):
    frame.drop(columns=to_drop, inplace=True)

##### Standardization Before SMOTE

In [None]:
# Standardize every column that is not listed in binary_columns.
# NOTE(review): binary_columns is not defined in this notebook (presumably it
# comes from Config or utils) — confirm it covers 'clicked' and the one-hot columns.
columns_to_standardize = [column for column in train_data.columns if column not in binary_columns]

# Fit the scaler on the training data only, then apply the same fitted
# transformation to the train and the test data.
scaler = StandardScaler()
scaler.fit(train_data[columns_to_standardize])
train_data[columns_to_standardize] = scaler.transform(train_data[columns_to_standardize])
test_data[columns_to_standardize] = scaler.transform(test_data[columns_to_standardize])


##### Undersampling

In [None]:
# Undersample the majority class with sklearn's resample.
# Separate majority (not clicked) and minority (clicked) classes.
train_data_majority = train_data[train_data.clicked==0]
train_data_minority = train_data[train_data.clicked==1]

# Target size of the majority class: 5 rows per minority row.
majority_rows_per_minority_row = 5
# Sampling without replacement cannot return more rows than are available, so
# cap the request at the size of the majority class instead of letting
# resample raise when the data is less imbalanced than 1:5.
n_majority_samples = min(
    train_data_majority.shape[0],
    train_data_minority.shape[0] * majority_rows_per_minority_row,
)

# Downsample the majority class.
df_majority_downsampled = resample(train_data_majority,
                                    replace=False,    # sample without replacement
                                    n_samples=n_majority_samples,
                                    random_state=12) # reproducible results

# Combine the minority class with the downsampled majority class.
train_data_under_sampled = pd.concat([df_majority_downsampled, train_data_minority])
# Display the new class counts.
print(train_data_under_sampled.clicked.value_counts())


##### SMOTE

In [None]:
# Oversample the undersampled training data with SMOTE.
# Split the frame into the feature matrix and the target vector.
smote_features = train_data_under_sampled.drop(columns=['clicked'])
smote_target = train_data_under_sampled.clicked

sm = SMOTE(random_state=12, sampling_strategy=0.5)
X_train_sm, y_train_sm = sm.fit_resample(smote_features, smote_target)

# Make sure both results are pandas DataFrames with the expected column names.
X_train_sm = pd.DataFrame(X_train_sm, columns=smote_features.columns)
y_train_sm = pd.DataFrame(y_train_sm, columns=['clicked'])

# Put the features and the target back together, side by side.
train_data_under_sampled_sm = pd.concat([X_train_sm, y_train_sm], axis=1)

# Distribution of the target variable after SMOTE.
train_data_under_sampled_sm['clicked'].value_counts()

##### Save Train and Test Sets (After Undersampling, With and Without SMOTE Oversampling)

In [None]:
# Save the processed frames as pickle files (paths are relative to the
# working directory): the undersampled train set, the undersampled + SMOTE
# train set, and the test set.
frames_to_save = {
    'processed_train_data_undersampled.pkl': train_data_under_sampled,
    'processed_train_data_undersampled_sm.pkl': train_data_under_sampled_sm,
    'processed_test_data.pkl': test_data,
}
for file_name, frame in frames_to_save.items():
    frame.to_pickle(file_name)