Import neccessary libraries

In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report,r2_score


Read data

In [8]:
jpn_data = pd.read_excel('JPN DATA.xlsx')
ind_data = pd.read_excel('IN_data.xlsx')

Look for Null values and remove row corresponding to Null value

In [10]:
jpn_data.isnull().sum()

ID            0
CURR_AGE      0
GENDER        0
ANN_INCOME    0
AGE_CAR       0
PURCHASE      0
dtype: int64

In [13]:
ind_data.isnull().sum()

ID            0
CURR_AGE      0
GENDER        0
ANN_INCOME    0
DT_MAINT      0
dtype: int64

No null value value found in both files

Create a new column in IND_Data.xlsx equivalent to have Car_age

In [15]:
reference_date = datetime(2019, 7, 1)
ind_data['DT_MAINT'] = pd.to_datetime(ind_data['DT_MAINT'])
ind_data['AGE_CAR'] = (reference_date - ind_data['DT_MAINT']).dt.days

In [6]:
# ind_data['ANN_INCOME']=ind_data['ANN_INCOME']/1.5

Create a new column in JPN Data.xlsx and Ind_Data.xlsx to contain category details

In [17]:
def categorize_age_car(age):
    if age < 200:
        return '<200'
    elif 200 <= age < 360:
        return '200-360'
    elif 360 <= age < 500:
        return '360-500'
    else:
        return '>500'

ind_data['AGE_CAR_CATEGORY'] = ind_data['AGE_CAR'].apply(categorize_age_car)
jpn_data['AGE_CAR_CATEGORY'] = jpn_data['AGE_CAR'].apply(categorize_age_car)

Encode categorical variables

In [19]:
ind_data['GENDER'] = LabelEncoder().fit_transform(ind_data['GENDER'])
ind_data = pd.get_dummies(ind_data, columns=['AGE_CAR_CATEGORY'], drop_first=True)

jpn_data['GENDER'] = LabelEncoder().fit_transform(jpn_data['GENDER'])
jpn_data = pd.get_dummies(jpn_data, columns=['AGE_CAR_CATEGORY'], drop_first=True)

Merge Japanese and Indian data on common columns for training

In [21]:
common_cols = ['CURR_AGE', 'GENDER', 'ANN_INCOME', 'AGE_CAR'] + \
              [col for col in ind_data.columns if col.startswith('AGE_CAR_CATEGORY_')]
merged_data = pd.concat([jpn_data[common_cols + ['PURCHASE']], ind_data[common_cols]], ignore_index=True)


Features are 
Curr_age: Current age of customer
Gender: Gender of customer
ANN_INCOME: annual income of customer
AGE_CAR: AGE of car

Target: 
Purchase: 1 if purchased else 0

Split data into features and target

In [23]:
X = merged_data.drop(columns=['PURCHASE'])
y = merged_data['PURCHASE'].dropna()

Handle missing values in X

In [25]:
X = X[X.index.isin(y.index)]

Split data into training and test sets

In [27]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Create models

In [40]:
model1=LogisticRegression()
model2=RandomForestClassifier()
model3=CategoricalNB()


Fit the models

In [42]:
model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Make predictions using x_test and y_test

In [44]:
log_pred=model1.predict(x_test)
ran_pred=model2.predict(x_test)
cat_pred=model3.predict(x_test)
x_test.shape

(8000, 7)

Evaluate metrics for each model 

In [46]:
log_report=classification_report(y_test, log_pred)
ran_report=classification_report(y_test, ran_pred)
cat_report=classification_report(y_test, cat_pred)

In [48]:
print("LogisticRegression: ")
print("accuracy_score: ",accuracy_score(y_test,log_pred))
print("r2_score: ",r2_score(y_test,log_pred))
#print(log_report)
print("RandomForestClassifier: ")
print("accuracy_score: ",accuracy_score(y_test,ran_pred))
print("r2_score: ",r2_score(y_test,ran_pred))
#print(ran_report)
print("Categorical : ")
print("accuracy_score: ",accuracy_score(y_test,cat_pred))
print("r2_score: ",r2_score(y_test,cat_pred))
#print(cat_report)

LogisticRegression: 
accuracy_score:  0.687
r2_score:  -0.28606471964052327
RandomForestClassifier: 
accuracy_score:  0.676625
r2_score:  -0.32869386170528503
Categorical : 
accuracy_score:  0.68575
r2_score:  -0.2912007608531453


Since Logistic Regression gives highest accuracy and r2_score, it is used for predicting indian data

In [50]:
model=model1

Predict for Indian customers

In [52]:
ind_data_features = ind_data[common_cols]
ind_data['PURCHASE_PREDICTION'] = model.predict(ind_data_features)


Count the number of predicted purchases

In [54]:
num_purchases = ind_data['PURCHASE_PREDICTION'].sum()


Check if the number of purchases exceeds 10,000 and display if it does

In [56]:
print(f'Number of predicted purchases: {num_purchases}')

Number of predicted purchases: 66960.0


Save the predictions to a new Excel file

In [58]:
ind_data.to_excel('IND_data_with_predictions.xlsx', index=False)