In [2]:
# logistic regression.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
data = pd.read_csv("cust_satisfaction.csv")
data.head()

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,satisfaction,Age,Flight Distance,Inflight entertainment,Baggage handling,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,Male,Loyal Customer,Personal Travel,Eco Plus,neutral or dissatisfied,13,460,5,4,5,25,18.0
1,Male,disloyal Customer,Business travel,Business,neutral or dissatisfied,25,235,1,3,1,1,6.0
2,Female,Loyal Customer,Business travel,Business,satisfied,26,1142,5,4,5,0,0.0
3,Female,Loyal Customer,Business travel,Business,neutral or dissatisfied,25,562,2,3,2,11,9.0
4,Male,Loyal Customer,Business travel,Business,satisfied,61,214,3,4,3,0,0.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      103904 non-null  object 
 1   Customer Type               103904 non-null  object 
 2   Type of Travel              103904 non-null  object 
 3   Class                       103904 non-null  object 
 4   satisfaction                103904 non-null  object 
 5   Age                         103904 non-null  int64  
 6   Flight Distance             103904 non-null  int64  
 7   Inflight entertainment      103904 non-null  int64  
 8   Baggage handling            103904 non-null  int64  
 9   Cleanliness                 103904 non-null  int64  
 10  Departure Delay in Minutes  103904 non-null  int64  
 11  Arrival Delay in Minutes    103594 non-null  float64
dtypes: float64(1), int64(6), object(5)
memory usage: 9.5+ MB


In [5]:
data.isna().sum()

Gender                          0
Customer Type                   0
Type of Travel                  0
Class                           0
satisfaction                    0
Age                             0
Flight Distance                 0
Inflight entertainment          0
Baggage handling                0
Cleanliness                     0
Departure Delay in Minutes      0
Arrival Delay in Minutes      310
dtype: int64

In [6]:
data["Arrival Delay in Minutes"].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Arrival Delay in Minutes"].fillna(0, inplace=True)


In [7]:
data = data.drop_duplicates()

In [8]:
data.isna().sum()

Gender                        0
Customer Type                 0
Type of Travel                0
Class                         0
satisfaction                  0
Age                           0
Flight Distance               0
Inflight entertainment        0
Baggage handling              0
Cleanliness                   0
Departure Delay in Minutes    0
Arrival Delay in Minutes      0
dtype: int64

In [9]:
loyal_customers = data[data["Customer Type"] == "Loyal Customer"]
disloyal_customers =data[data["Customer Type"] == "disloyal Customer"]

In [10]:
loyal_customers = loyal_customers.sample(20000,random_state=42)
balance_pf=pd.concat([loyal_customers,disloyal_customers], axis=0)
balance_pf.reset_index(drop=True,inplace=True)
balance_pf.shape

(38954, 12)

In [11]:
numerical_features = data.select_dtypes(exclude='object')
categorical_features = data.select_dtypes(include='object')
categorical_features.info()


<class 'pandas.core.frame.DataFrame'>
Index: 103732 entries, 0 to 103903
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Gender          103732 non-null  object
 1   Customer Type   103732 non-null  object
 2   Type of Travel  103732 non-null  object
 3   Class           103732 non-null  object
 4   satisfaction    103732 non-null  object
dtypes: object(5)
memory usage: 4.7+ MB


In [12]:
## encoding using LabelEncoder and oneHotEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
ohe = OneHotEncoder(drop="if_binary")
cat_col_encoded = ohe.fit_transform(categorical_features).toarray()

In [13]:
cat_col_encoded = pd.DataFrame(cat_col_encoded, columns=list(ohe.get_feature_names_out(categorical_features.columns)))
cat_col_encoded = cat_col_encoded.reset_index(drop=True)
numerical_features = numerical_features.reset_index(drop=True)
final_data = pd.concat([cat_col_encoded, numerical_features], axis=1)
final_data.to_csv("encoded_data.csv", index=False)
final_data.head()

Unnamed: 0,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_satisfied,Age,Flight Distance,Inflight entertainment,Baggage handling,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,13,460,5,4,5,25,18.0
1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,25,235,1,3,1,1,6.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,26,1142,5,4,5,0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,25,562,2,3,2,11,9.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,61,214,3,4,3,0,0.0


In [14]:
x = final_data.drop(columns=["Customer Type_disloyal Customer"])
y = final_data["Customer Type_disloyal Customer"]
# x = final_data.drop(columns=["Customer Type"])
# y = final_data["Customer Type"]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# ## Logistic Regression
model= LogisticRegression(max_iter=10000)
model.fit(x_train, y_train)
# saving the trained model
import pickle
pickle.dump(model, open("model.pkl", "wb"))


In [16]:
model.score(x_train, y_train)
# model.score(x_test, y_test)


0.8992468518406941