# This notebook contains only the cleaning of the model

In [1]:
# Importing the basic libraries we will require for the project

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Importing the Machine Learning models we require from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


# Importing the other functions we may require from Scikit-Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

# To get different metric scores
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,plot_confusion_matrix,precision_recall_curve,roc_curve,make_scorer

# Code to ignore warnings from function usage
import warnings;
import numpy as np
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.impute import KNNImputer


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
# Load the cleaned survey and travel tables for the test set.
df1_test = pd.read_csv('Surveydata_test_cleaned_v2.csv')
df2_test = pd.read_csv('Traveldata_test_cleaned_v2.csv')

# Join the two tables on 'ID', then discard the 'Unnamed: 0' column that each
# table carried into the merge (suffixed _x / _y by pandas).
df_union_test = (
    df1_test
    .merge(df2_test, on='ID')
    .drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'])
)

df_union_test.shape



(35602, 24)

In [3]:
# Load the cleaned survey and travel tables for the training set.
df1_train = pd.read_csv('Surveydata_train_cleaned_v2.csv')
df2_train = pd.read_csv('Traveldata_train_cleaned_v2.csv')

# Join the two tables on 'ID', then discard the 'Unnamed: 0' column that each
# table carried into the merge (suffixed _x / _y by pandas).
df_union_train = (
    df1_train
    .merge(df2_train, on='ID')
    .drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'])
)

df_union_train.shape

(94379, 25)

In [4]:
def accuracy(y_train_test, y_pred_train_test):
    """Return the fraction of predictions that match the true labels.

    Parameters
    ----------
    y_train_test : array-like of shape (n_samples,)
        True labels.
    y_pred_train_test : array-like of shape (n_samples,)
        Predicted labels, in the same order as ``y_train_test``.

    Returns
    -------
    float
        Number of correct predictions divided by the number of samples.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of elements.
    """
    # np.asarray discards any pandas index, so the comparison below is purely
    # positional (a shuffled Series index must not re-align the labels).
    y_true = np.asarray(y_train_test).ravel()
    y_pred = np.asarray(y_pred_train_test).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_train_test and y_pred_train_test must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy on empty inputs")

    # Counting matches directly works for any number of classes, including the
    # case where only a single label occurs (a 1x1 confusion matrix has no
    # [1][1] cell to index).
    return float(np.mean(y_true == y_pred))

In [5]:
# Target vector: the 'Overall_Experience' column of the merged training table.
y_train = df_union_train["Overall_Experience"]

# Feature matrix: everything except the 'ID' column and the target.
X_train = df_union_train.drop(columns=['ID', "Overall_Experience"])

X_train.shape

(94379, 23)

In [6]:
# Feature matrix for the test table: every column except 'ID'.
X_test = df_union_test.drop(columns=['ID'])

X_test.shape

(35602, 23)

## Split the training data into train and validation sets

In [7]:
# Hold out 30% of the labelled rows for validation. The split is stratified on
# the target and seeded so it is the same on every run.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=5,
    stratify=y_train,
)

In [8]:
# Columns that are rescaled with MinMaxScaler before imputation.
numeric_features_min_max = [
    'Seat_Comfort', 'Seat_Class', 'Arrival_Time_Convenient', 'Catering',
    'Platform_Location', 'Onboard_Wifi_Service', 'Onboard_Entertainment',
    'Online_Support', 'Ease_of_Online_Booking', 'Onboard_Service', 'Legroom',
    'Baggage_Handling', 'CheckIn_Service', 'Cleanliness', 'Online_Boarding',
    'Gender', 'Customer_Type', 'Age', 'Type_Travel', 'Travel_Class',
]

# Columns that are standardised (zero mean, unit variance) before imputation.
numeric_features_standard = [
    'Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins',
]


def _scale_then_impute(scaler):
    """Build a pipeline that applies `scaler` and then KNN-imputes missing values.

    Imputation comes after scaling so the KNN distances are computed on scaled
    features; scikit-learn's scalers disregard NaNs in fit and keep them in
    transform, so the missing values survive until the imputer step.
    """
    # TODO: compare against SimpleImputer(strategy='median') and
    # IterativeImputer(random_state=0, initial_strategy='median').
    return Pipeline(
        steps=[('scaler', scaler), ('imputer', KNNImputer(n_neighbors=4))]
    )


# One transformer per feature group. Previously both groups shared a single
# MinMaxScaler -> StandardScaler chain; standardising min-max-scaled data gives
# the same result as standardising the raw data, so the min-max step had no
# effect and the two feature lists were treated identically.
min_max_transformer = _scale_then_impute(MinMaxScaler())
standard_transformer = _scale_then_impute(StandardScaler())

# Kept so that any later cell still referring to the old shared name keeps
# working; it is equivalent to the original chain (see the note above).
numeric_transformer = standard_transformer

# Here the pre-processor step
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric1", min_max_transformer, numeric_features_min_max),
        ("numeric2", standard_transformer, numeric_features_standard),
    ]
)

# Here the pipeline. The final step keeps its original name 'regressor' (even
# though the estimator is a classifier) so existing references to that step
# name do not break.
pipe_rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'regressor',
            # random_state makes the forest reproducible; 5 matches the seed
            # used for the train/validation split.
            RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=5),
        ),
    ]
)


In [None]:
# Fit the full pipeline on the training split (fit returns the pipeline itself),
# then predict on the held-out validation split.
y_pred_train_test = pipe_rf.fit(X_train_train, y_train_train).predict(X_train_test)

# Validation accuracy, shown as the cell output.
accuracy(y_train_test, y_pred_train_test)