In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.svm import SVC ## 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [2]:
# Invistico Airline dataset (CSV) hosted in the course's GitHub repository.
DATA_URL = (
    "https://raw.githubusercontent.com/ManonYa09/Statistics_with_Python_G7/main/Dataset/"
    "YdGwfiz4Tp2RsH4s-E6d5g_fe6fe3c8cf0d49028b6706bf33f91df1_Invistico_Airline.csv"
)
df = pd.read_csv(DATA_URL)

In [3]:
# Continue with a random subsample of at most 40,000 rows (presumably to keep the
# slower models, e.g. SVC, quick to fit).
# - random_state pins the draw so a kernel restart selects the same rows.
# - The size is capped at len(df) because DataFrame.sample raises ValueError when
#   n exceeds the number of rows and replace=False.
df = df.sample(n=min(40000, len(df)), random_state=42)

In [4]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer

In [5]:
# Sort every feature column into one of three groups by its number of distinct values:
#   exactly 2          -> yes_no_columns
#   0, 1 or 3          -> cat_columns
#   more than 3        -> num_columns
# The target column is never placed in any group.
target = 'satisfaction'
yes_no_columns, num_columns, cat_columns = [], [], []

for column in df.columns:
    if column == target:
        continue
    distinct_count = df[column].nunique()  # NaN is not counted by nunique()
    if distinct_count == 2:
        bucket = yes_no_columns
    elif distinct_count <= 3:
        bucket = cat_columns
    else:
        bucket = num_columns
    bucket.append(column)

In [6]:
# Numeric features: replace missing values with the column mean, then standardise.
mean_imputer = SimpleImputer(strategy='mean')
standard_scaler = StandardScaler()
numeric_transformer = Pipeline(
    steps=[('imputer', mean_imputer), ('scaler', standard_scaler)]
)

In [7]:
# Categorical features: replace missing values with the mode, then one-hot encode.
# handle_unknown='ignore' makes transform() encode a category that was absent from
# the fitted data (possible inside a cross-validation fold or a random split) as an
# all-zero row instead of raising ValueError.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [8]:
# Two-valued features: replace missing values with the mode, then map each category
# to an integer code with OrdinalEncoder (no one-hot expansion).
mode_imputer = SimpleImputer(strategy='most_frequent')
ordinal_encoder = OrdinalEncoder()
yes_no_transformer = Pipeline(
    steps=[('imputer', mode_imputer), ('odinal', ordinal_encoder)]
)

In [9]:
# Send each column group through its own sub-pipeline and concatenate the results.
# No `remainder` is given, so ColumnTransformer's default applies to unlisted columns.
column_routes = [
    ('num', numeric_transformer, num_columns),
    ('cat', categorical_transformer, cat_columns),
    ('yes_no', yes_no_transformer, yes_no_columns),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [10]:
# Baseline model: shared preprocessing followed by logistic regression with default
# hyper-parameters.
# NOTE(review): the `preprocessor` object is passed as-is, so any other pipeline built
# from the same object shares (and re-fits) it — confirm this is intended.
logistic_steps = [
    ('preprocessor', preprocessor),
    ('model', LogisticRegression()),
]
traning_pipeline_logistics = Pipeline(steps=logistic_steps)

In [11]:
# Separate the label column from the predictor columns.
y = df[target]
x = df.drop(columns=target)

In [12]:
# Hold out part of the data for evaluation (train_test_split's default test size).
# A fixed random_state makes the split identical on every run, which is a
# prerequisite for reproducible evaluation metrics.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [13]:
# Fit the preprocessing steps and the logistic-regression model on the training split.
traning_pipeline_logistics.fit(X=x_train, y=y_train)

In [14]:
# Predict labels for the held-out split with the fitted logistic-regression pipeline.
y_prediction_logistics = traning_pipeline_logistics.predict(X=x_test)

In [15]:
# Support-vector classifier with default hyper-parameters behind the shared preprocessing.
svc_steps = [
    ('preprocessor', preprocessor),
    ('model', SVC()),
]
traning_pipeline_SVC = Pipeline(steps=svc_steps)

In [16]:
# Fit the SVC pipeline on the training split.
traning_pipeline_SVC.fit(X=x_train, y=y_train)

In [17]:
# Predict labels for the held-out split with the fitted SVC pipeline.
y_prediction_svm = traning_pipeline_SVC.predict(X=x_test)

In [18]:
# Display the SVC predictions for the held-out split (NumPy abbreviates the array repr).
y_prediction_svm

array(['satisfied', 'satisfied', 'dissatisfied', ..., 'satisfied',
       'satisfied', 'satisfied'], dtype=object)

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
# Decision-tree classifier behind the shared preprocessing.
# random_state is fixed because tree construction is randomised (feature order and
# tie-breaking between equally good splits), so an unseeded tree can differ between runs.
traning_pipeline_DecisionTree = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])

In [21]:
# Fit the decision-tree pipeline on the training split.
traning_pipeline_DecisionTree.fit(X=x_train, y=y_train)

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Random-forest classifier behind the shared preprocessing.
# random_state is fixed because the forest bootstraps rows and samples features at
# random; without a seed the fitted model and its metrics change on every run.
traning_pipeline_Forest = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

In [24]:
# Fit the random-forest pipeline on the training split.
traning_pipeline_Forest.fit(X=x_train, y=y_train)

In [25]:
# Predict labels for the held-out split with the fitted random-forest pipeline.
y_prediction_Forest = traning_pipeline_Forest.predict(X=x_test)

In [26]:
# Hyper-parameter grid for the SVC pipeline: one candidate per kernel.
# The 'model__' prefix addresses the pipeline step named 'model'.
svc_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = {'model__kernel': svc_kernels}

In [27]:
from sklearn.model_selection import GridSearchCV 

In [28]:
# Rebind traning_pipeline_SVC to a fresh, unfitted SVC pipeline that serves as the
# base estimator for the grid search.
svc_search_steps = [
    ('preprocessor', preprocessor),
    ('model', SVC()),
]
traning_pipeline_SVC = Pipeline(steps=svc_search_steps)

In [29]:
# Cross-validated search over the SVC kernels in param_grid, scored by accuracy.
# - cv=5 spells out the fold count (scikit-learn's default) so the cost is visible:
#   4 kernels x 5 folds = 20 pipeline fits, plus the final refit.
# - n_jobs=-1 spreads those independent fits over all CPU cores; the selected
#   parameters and scores are unaffected by the degree of parallelism.
grid = GridSearchCV(
    estimator=traning_pipeline_SVC,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)


In [30]:
# Run the kernel search on the training split only; the held-out split stays unseen.
grid.fit(X=x_train, y=y_train)

In [31]:
# Show the kernel the grid search selected (best mean cross-validated accuracy).
grid.best_params_

{'model__kernel': 'rbf'}

In [32]:
# Predict with the grid search's best pipeline. This rebinds y_prediction_svm,
# replacing the predictions made earlier by the stand-alone SVC pipeline.
y_prediction_svm = grid.predict(X=x_test)

In [33]:
from sklearn.metrics import confusion_matrix

In [34]:
# NOTE(review): stray expression — builds a new, unfitted RandomForestClassifier only to
# display its repr; the result is not assigned to anything, so this cell looks removable.
RandomForestClassifier()

In [35]:
# Confusion matrix for the random forest (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_prediction_Forest)

array([[4320,  255],
       [ 307, 5118]])

In [36]:
# Confusion matrix for the SVC predictions (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_prediction_svm)

array([[4235,  340],
       [ 361, 5064]])

In [37]:
# Confusion matrix for the decision tree; its predictions are computed here because
# no earlier cell stored them.
y_prediction_DecisionTree = traning_pipeline_DecisionTree.predict(x_test)
confusion_matrix(y_test, y_prediction_DecisionTree)

array([[4160,  415],
       [ 400, 5025]])

In [38]:
# Confusion matrix for logistic regression (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_prediction_logistics)

array([[3653,  922],
       [ 843, 4582]])

In [39]:
from sklearn.ensemble import GradientBoostingClassifier

In [40]:
# Gradient-boosting classifier behind the shared preprocessing.
# random_state is fixed so the individual trees (whose construction is randomised)
# and the resulting metrics are identical on every run.
traning_pipeline_Boosting = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(random_state=42))
])

In [41]:
# Fit the gradient-boosting pipeline on the training split.
traning_pipeline_Boosting.fit(X=x_train, y=y_train)

In [42]:
# Predict labels for the held-out split with the fitted gradient-boosting pipeline.
y_prediction_boosting = traning_pipeline_Boosting.predict(X=x_test)

In [43]:
# Confusion matrix for gradient boosting (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_prediction_boosting)

array([[4148,  427],
       [ 420, 5005]])