In [1]:
# importing necessary files
import pickle
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the tab-separated campaign export into "customer_data" and preview the first rows
customer_data = pd.read_csv("marketing_campaign.csv", delimiter="\t")
customer_data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [3]:
# Mean of the known incomes, used as the fill value for missing entries
mean_income = customer_data['Income'].mean()

# Assign the filled column back to the frame. Calling fillna(..., inplace=True) on
# the selected column is chained assignment: recent pandas versions warn about it,
# and with Copy-on-Write enabled it leaves customer_data unchanged.
customer_data['Income'] = customer_data['Income'].fillna(mean_income)

# Confirm that no column has missing values left
customer_data.isna().sum()

ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Z_CostContact          0
Z_Revenue              0
Response               0
dtype: int64

In [4]:
# List every column name available in the loaded dataset
customer_data.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')

In [5]:
# Keep only the numeric columns; text columns cannot take part in the correlation
numeric_data = customer_data.select_dtypes(include='number')

# Correlation of every numeric column with the "Response" target
correlation_matrix = numeric_data.corr()
correlations = correlation_matrix['Response']
print(correlations)


ID                    -0.021968
Year_Birth             0.021325
Income                 0.132756
Kidhome               -0.080008
Teenhome              -0.154446
Recency               -0.198437
MntWines               0.247254
MntFruits              0.125289
MntMeatProducts        0.236335
MntFishProducts        0.111331
MntSweetProducts       0.117372
MntGoldProds           0.139850
NumDealsPurchases      0.002238
NumWebPurchases        0.148730
NumCatalogPurchases    0.220810
NumStorePurchases      0.039363
NumWebVisitsMonth     -0.003987
AcceptedCmp3           0.254258
AcceptedCmp4           0.177019
AcceptedCmp5           0.326634
AcceptedCmp1           0.293982
AcceptedCmp2           0.169293
Complain              -0.001707
Z_CostContact               NaN
Z_Revenue                   NaN
Response               1.000000
Name: Response, dtype: float64


In [6]:
# Drop every column that is not used for modelling. What remains is Income, the
# wine/fruit/meat spend columns, NumCatalogPurchases, the five AcceptedCmp flags
# and the Response target.
# Each label is listed once; the list is grouped by theme for readability.
columns_to_drop = [
    # identifiers, demographics and dates
    'ID', 'Year_Birth', 'Education', 'Marital_Status', 'Kidhome', 'Teenhome',
    'Dt_Customer', 'Recency',
    # spend columns that are not kept
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
    # purchase / visit counters that are not kept
    'NumDealsPurchases', 'NumWebPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
    # complaint flag and the two Z_* columns (their correlation with Response is NaN)
    'Complain', 'Z_CostContact', 'Z_Revenue',
]
customer_data_reduced = customer_data.drop(columns=columns_to_drop)

In [7]:
# Check which columns remain after the drop
customer_data_reduced.columns

Index(['Income', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'NumCatalogPurchases', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
       'AcceptedCmp1', 'AcceptedCmp2', 'Response'],
      dtype='object')

In [8]:
# Re-wrap the reduced data in a DataFrame.
# NOTE(review): customer_data_reduced is already the DataFrame returned by drop(),
# so this looks redundant — confirm before removing.
customer_data_reduced=pd.DataFrame(customer_data_reduced)

In [9]:
# Display the reduced frame (the feature columns plus the Response target)
customer_data_reduced

Unnamed: 0,Income,MntWines,MntFruits,MntMeatProducts,NumCatalogPurchases,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response
0,58138.0,635,88,546,10,0,0,0,0,0,1
1,46344.0,11,1,6,1,0,0,0,0,0,0
2,71613.0,426,49,127,2,0,0,0,0,0,0
3,26646.0,11,4,20,0,0,0,0,0,0,0
4,58293.0,173,43,118,3,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2235,61223.0,709,43,182,3,0,0,0,0,0,0
2236,64014.0,406,0,30,2,0,0,0,1,0,0
2237,56981.0,908,48,217,3,0,1,0,0,0,0
2238,69245.0,428,30,214,5,0,0,0,0,0,0


In [10]:
# Summarise row count, non-null counts and dtypes of the reduced frame
customer_data_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Income               2240 non-null   float64
 1   MntWines             2240 non-null   int64  
 2   MntFruits            2240 non-null   int64  
 3   MntMeatProducts      2240 non-null   int64  
 4   NumCatalogPurchases  2240 non-null   int64  
 5   AcceptedCmp3         2240 non-null   int64  
 6   AcceptedCmp4         2240 non-null   int64  
 7   AcceptedCmp5         2240 non-null   int64  
 8   AcceptedCmp1         2240 non-null   int64  
 9   AcceptedCmp2         2240 non-null   int64  
 10  Response             2240 non-null   int64  
dtypes: float64(1), int64(10)
memory usage: 192.6 KB


In [11]:
# Show the dtype of each remaining column
customer_data_reduced.dtypes

Income                 float64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
NumCatalogPurchases      int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
AcceptedCmp1             int64
AcceptedCmp2             int64
Response                 int64
dtype: object

In [12]:
# NOTE: the IQR computation below is disabled (kept commented out for reference).
# # calculating the first quartile (25th percentile)
# q1=customer_data_reduced.quantile(0.25)

# # calculating the third quartile (75th percentile)
# q3=customer_data_reduced.quantile(0.75)
# print(q1)
# print(q3)

# # Calculate the interquartile range (IQR) for each feature
# iqr=q3-q1
# print(iqr)

In [13]:
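# NOTE: IQR-based outlier removal is disabled (kept commented out for reference).
# # Remove outliers based on the interquartile range (IQR): drop any row that has a value outside [q1 - 1.5*iqr, q3 + 1.5*iqr]
# dataset=customer_data_reduced[~((customer_data_reduced<(q1-1.5*iqr)) | (customer_data_reduced > (q3 + 1.5*iqr))).any(axis=1)]
# dataset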

In [14]:
# Separate the "Response" target (y) from the predictor columns (X)
y = customer_data_reduced.loc[:, 'Response']
X = customer_data_reduced.drop('Response', axis=1)

In [15]:
# Take the unscaled predictors straight from customer_data_reduced instead of X.
# X is overwritten with the scaled array below, so fitting on X would make a
# re-run of this cell re-fit (and re-save) the scaler on already-standardised data.
raw_features = customer_data_reduced.drop(columns=['Response'])

# NOTE(review): the scaler is fitted on every row before the data is split into
# train and test sets, so test rows influence the scaling statistics — consider
# fitting on the training rows only.
scaler = StandardScaler()
X = scaler.fit_transform(raw_features)

# Persist the fitted scaler so the same scaling can be applied at prediction time
artifact_dir = r'C:\Users\kusha\OneDrive\Desktop\Python_Workspace_Evaluations\Weekly_Assignment_8\ml_deployment\datafiles'
scaler_filename = 'scaler_file.pkl'
file_path = artifact_dir + '\\' + scaler_filename

with open(file_path, 'wb') as f:
    pickle.dump(scaler, f)

# Report the name of the file that was actually written
print(f"Scaler saved as {scaler_filename}")

Scaler saved as Scaler_file.pkl


In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [17]:
# Initializing the logistic regression model with an iteration limit of 3000 (max_iter)
logistic_regression_model=LogisticRegression(max_iter=3000)

# Training the logistic regression model on the training split
logistic_regression_model.fit(X_train,y_train)

In [18]:
# Logistic regression predictions for the training split and the held-out test split
training_prediction = logistic_regression_model.predict(X_train)
testing_prediction = logistic_regression_model.predict(X_test)

In [19]:
# Share of test rows the logistic regression model classified correctly
logistic_regression_test_accuracy = accuracy_score(y_test, testing_prediction)
print("Accuracy score of our model on testing data: ", logistic_regression_test_accuracy)

Accuracy score of our model on testing data:  0.8660714285714286


In [20]:
# Per-class precision, recall and F1 of the logistic regression model on the test split
logistic_regression_report = classification_report(y_test, testing_prediction)
print(logistic_regression_report)

              precision    recall  f1-score   support

           0       0.87      0.99      0.93       380
           1       0.72      0.19      0.30        68

    accuracy                           0.87       448
   macro avg       0.80      0.59      0.61       448
weighted avg       0.85      0.87      0.83       448



In [21]:
# Initializing the support vector classifier with its default settings (no kernel argument is passed)
support_vector_classification_model=SVC()

# Training the support vector classifier on the training split
support_vector_classification_model.fit(X_train,y_train)

In [22]:
# Support vector classifier predictions for the training split and the held-out test split
training_prediction_svc_model = support_vector_classification_model.predict(X_train)
testing_prediction_svc_model = support_vector_classification_model.predict(X_test)

In [23]:
# Share of test rows the support vector classifier classified correctly
svc_test_accuracy = accuracy_score(y_test, testing_prediction_svc_model)
print("Accuracy score of our model on testing data: ", svc_test_accuracy)

Accuracy score of our model on testing data:  0.8727678571428571


In [24]:
# Per-class precision, recall and F1 of the support vector classifier on the test split
svc_report = classification_report(y_test, testing_prediction_svc_model)
print(svc_report)

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       380
           1       0.79      0.22      0.34        68

    accuracy                           0.87       448
   macro avg       0.83      0.61      0.64       448
weighted avg       0.86      0.87      0.84       448



In [25]:
# Initializing the random forest classifier. The forest is built with randomness,
# so a fixed random_state (the same seed as the train/test split) makes the fitted
# model and its scores repeatable across runs.
random_forest_classification_model = RandomForestClassifier(random_state=1)

# Training the random forest classifier on the training split
random_forest_classification_model.fit(X_train, y_train)

In [26]:
# Random forest predictions for the training split and the held-out test split
training_prediction_rfc_model = random_forest_classification_model.predict(X_train)
testing_prediction_rfc_model = random_forest_classification_model.predict(X_test)

In [27]:
# Share of test rows the random forest classified correctly
rfc_test_accuracy = accuracy_score(y_test, testing_prediction_rfc_model)
print("Accuracy score of our model on testing data: ", rfc_test_accuracy)

Accuracy score of our model on testing data:  0.8482142857142857


In [28]:
# Initializing the KNeighborsClassifier model with its default settings
knn_model=KNeighborsClassifier()

# Training the k-nearest-neighbours model on the training split
knn_model.fit(X_train,y_train)

In [29]:
# k-nearest-neighbours predictions for the training split and the held-out test split
training_prediction_knn_model = knn_model.predict(X_train)
testing_prediction_knn_model = knn_model.predict(X_test)

In [30]:
# Share of test rows the k-nearest-neighbours model classified correctly
knn_test_accuracy = accuracy_score(y_test, testing_prediction_knn_model)
print("Accuracy score of our model on testing data: ", knn_test_accuracy)

Accuracy score of our model on testing data:  0.8482142857142857


In [31]:
# Persist the trained support vector classifier to disk with pickle
file_path = r'C:\Users\kusha\OneDrive\Desktop\Python_Workspace_Evaluations\Weekly_Assignment_8\ml_deployment\datafiles\svc_model_file.pkl'

with open(file_path, mode='wb') as model_file:
    pickle.dump(support_vector_classification_model, model_file)