<a href="https://colab.research.google.com/github/Ishita2patel/CIND820-Project/blob/main/Crossvalidation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install memory-profiler

Collecting memory-profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.61.0


In [3]:
#Data processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

#Model performance metrics
from time import process_time
from memory_profiler import profile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Feature selection and models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

#Data scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
%load_ext memory_profiler

Models

In [4]:
#Random Forest Model
def RFClassifier(X_train, y_train, X_test, y_test):
  #Initialize the Random Forest Classifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()

  #Fit the classifier to the data
  forest_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

Modified Models to include SMOTE and standardization

In [5]:
#Random Forest function
def RFClassifierMOD(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()

  #Data preprocessing
  smote = SMOTE()
  X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

  # Subset the numeric attributes
  X_train_smote_numeric = X_train_smote[numeric_attributes]
  X_test_numeric = X_test[numeric_attributes]

  scaler = StandardScaler()
  X_train_smote_S = scaler.fit_transform(X_train_smote_numeric)
  X_test_numeric_S = scaler.transform(X_test_numeric)

  # Concatenate the standardized numeric attributes with the categorical attributes
  X_train_combined = np.concatenate((X_train_smote_S, X_train_smote[cat_attributes]), axis=1)
  X_test_combined = np.concatenate((X_test_numeric_S, X_test[cat_attributes]), axis=1)

  #Fit the Classifier to the data
  forest_raw_imbalanced.fit(X_train_combined, y_train_smote)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test_combined)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

Crossvalidation Models

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit

K-Fold Validation Models

In [7]:
#Random Forest Function
def RFClassifierKFold(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_folds):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()

  #Specify number of folds (k) for cross validation
  kfold = KFold(n_splits = num_folds)

  results = cross_val_score(forest_raw_imbalanced, X_train, y_train, cv = kfold)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []

  all_predictions = []
  all_true_labels = []

  for train_index, val_index in kfold.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]

    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)

    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]

    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)

    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)

    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    forest_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = forest_raw_imbalanced.predict(X_test_combined)

    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)

    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)

    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)

  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""

  print(summary_report)
  print(time)

Time-Series crossvalidation

In [8]:
#Random Forest Function
def RFClassifierTS(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_folds):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()

  #Specify number of folds (k) for cross validation
  tscv = TimeSeriesSplit(n_splits = num_folds)

  results = cross_val_score(forest_raw_imbalanced, X_train, y_train, cv = tscv)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []

  all_predictions = []
  all_true_labels = []

  for train_index, val_index in tscv.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]

    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)

    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]

    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)

    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)

    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    forest_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = forest_raw_imbalanced.predict(X_test_combined)

    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)

    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)

    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)

  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""

  print(summary_report)
  print(time)

Data Import and Processing

In [9]:
#https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("online_shoppers_intention.csv")
#Identify categorical attributes
categorical_features = ["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"]
df_cat = df[categorical_features]

df_onehot = pd.get_dummies(df, columns = categorical_features, prefix = categorical_features)

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_onehot['Revenue'] = label_encoder.fit_transform(df['Revenue'])

Control SMOTE data

In [10]:
#Specify independent/ dependent values
X = df_onehot.drop(columns = "Revenue")
y = df_onehot["Revenue"]

#Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

Filtered Data
Pearson Correlation Filters

In [11]:
#Correlation of Onehot encoded dataset

corr = df_onehot.corr()

revenue_correlation = corr["Revenue"]
sorted_pearson_correlation = revenue_correlation.abs().sort_values(ascending = False)

sorted_pearson_correlation_df = pd.DataFrame(sorted_pearson_correlation)
SPC_topquantile = sorted_pearson_correlation_df.quantile(0.75)
filtered_df = sorted_pearson_correlation_df[sorted_pearson_correlation_df >= SPC_topquantile]
filtered_df.dropna(inplace = True)
#18 attributes were kept, were in the top quantile

df_pearson = df_onehot[filtered_df.index.tolist()]

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])

#Specify independent/ dependent values
X_p = df_pearson.drop(columns = "Revenue")
y_p = df_pearson["Revenue"]

#Split the Data
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p, y_p, test_size = 0.3)

smote = SMOTE()
X_p_train_smote, y_p_train_smote = smote.fit_resample(X_p_train, y_p_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])


Random Forest Features

In [12]:
#Filters of RF Classifier

rf_classifier = RandomForestClassifier()

#Fit random forest classifier
rf_classifier.fit(X_train_smote, y_train_smote)

feature_importances = rf_classifier.feature_importances_

rf_df = pd.DataFrame({"Feature": X_train_smote.columns, "Importance": feature_importances})

sorted_features = np.argsort(feature_importances)[::-1]

#Sorting features
rf_df_sorted = rf_df.sort_values("Importance", ascending = False)
rf_df_sorted = rf_df_sorted.reset_index(drop = True)
rf_df_sorted

rf_df_sorted.describe()
RF_topquantile = rf_df_sorted['Importance'].quantile(0.75)
RFfiltered_df = rf_df_sorted.loc[rf_df_sorted['Importance'] >= RF_topquantile]
#19 features were kept after keeping the top quartile of results
filtered_attributes_rf = RFfiltered_df.index.tolist()
df_rf = df_onehot[RFfiltered_df["Feature"]]

df_rf["Revenue"] = df_onehot["Revenue"]

#Specify independent/ dependent values
X_rf = df_rf.drop(columns = "Revenue")
y_rf = df_rf["Revenue"]

#Split the Data
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size = 0.3)

smote = SMOTE()
X_rf_train_smote, y_rf_train_smote = smote.fit_resample(X_rf_train, y_rf_train)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rf["Revenue"] = df_onehot["Revenue"]


In [13]:
#Define columns that need data normalization/ standardization
numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

p_numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration','BounceRates',
       'ExitRates', 'PageValues', 'SpecialDay']

rf_numeric_features = ['Administrative', 'Administrative_Duration',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues']

cat_features = [col for col in X_train if col not in numeric_features]
p_cat_features = [col for col in X_p_train if col not in numeric_features]
rf_cat_features = [col for col in X_rf_train if col not in numeric_features]

Modified Functions to include oversampling/scaling within the function

Unfiltered Data

In [14]:
%memit RFClassifierMOD(X_train, y_train, X_test, y_test, numeric_features, cat_features)

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      3152
           1       0.68      0.66      0.67       547

    accuracy                           0.90      3699
   macro avg       0.81      0.80      0.81      3699
weighted avg       0.90      0.90      0.90      3699

[[2983  169]
 [ 187  360]]
2.458183323
peak memory: 320.88 MiB, increment: 38.18 MiB


Pearson Correlation Filtered Data

In [15]:
%memit RFClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3112
           1       0.65      0.70      0.67       587

    accuracy                           0.89      3699
   macro avg       0.79      0.82      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2886  226]
 [ 174  413]]
3.3424328259999996
peak memory: 320.89 MiB, increment: 0.02 MiB


Random Forest Features

In [16]:
%memit RFClassifierMOD(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features)

              precision    recall  f1-score   support

           0       0.96      0.93      0.94      3152
           1       0.66      0.75      0.70       547

    accuracy                           0.91      3699
   macro avg       0.81      0.84      0.82      3699
weighted avg       0.91      0.91      0.91      3699

[[2939  213]
 [ 136  411]]
2.575466091999999
peak memory: 320.91 MiB, increment: 0.02 MiB


K-Fold Crossvalidation

Unfiltered Data

In [17]:
%memit RFClassifierKFold(X_train, y_train, X_test, y_test, numeric_features, cat_features, 5)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      7270
           1       0.78      0.50      0.61      1361

    accuracy                           0.90      8631
   macro avg       0.85      0.74      0.78      8631
weighted avg       0.89      0.90      0.89      8631

11.844049472999998
peak memory: 346.26 MiB, increment: 25.35 MiB


Pearson correlation features

In [18]:
%memit RFClassifierKFold(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7310
           1       0.72      0.56      0.63      1321

    accuracy                           0.90      8631
   macro avg       0.82      0.76      0.78      8631
weighted avg       0.89      0.90      0.89      8631

11.878873278999997
peak memory: 328.16 MiB, increment: 1.94 MiB


Random forest features

In [19]:
%memit RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7270
           1       0.75      0.54      0.63      1361

    accuracy                           0.90      8631
   macro avg       0.83      0.75      0.79      8631
weighted avg       0.89      0.90      0.89      8631

11.857269144
peak memory: 327.20 MiB, increment: 0.95 MiB


Time-Series Crossvalidation
Unfiltered Data

In [20]:
%memit RFClassifierTS(X_train, y_train, X_test, y_test, numeric_features, cat_features, 5)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      6077
           1       0.77      0.50      0.61      1113

    accuracy                           0.90      7190
   macro avg       0.84      0.74      0.78      7190
weighted avg       0.89      0.90      0.89      7190

7.776894879999993
peak memory: 342.71 MiB, increment: 15.51 MiB


Pearson correlation data

In [21]:
%memit RFClassifierTS(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6081
           1       0.71      0.56      0.63      1109

    accuracy                           0.90      7190
   macro avg       0.82      0.76      0.78      7190
weighted avg       0.89      0.90      0.89      7190

7.517051772000002
peak memory: 342.76 MiB, increment: 0.05 MiB


Random Forest features

In [22]:
%memit RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 5)

              precision    recall  f1-score   support

           0       0.91      0.96      0.94      6036
           1       0.73      0.51      0.61      1154

    accuracy                           0.89      7190
   macro avg       0.82      0.74      0.77      7190
weighted avg       0.88      0.89      0.88      7190

7.270151698999996
peak memory: 342.78 MiB, increment: 0.02 MiB
