In [1]:
!pip install keras
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# DATA CLEANING AND PREPROCESSING

In [2]:
data = pd.read_csv('randomdata.csv')

# #Data Exploration

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Customer Name,Customer_Address,Company Name,Claim Reason,Data confidentiality,Claim Amount,Category Premium,Premium/Amount Ratio,Claim Request output,BMI,Churn
0,0,Christine Payne,"7627 Anderson Rest Apt. 265,Lake Heather, DC 3...","Williams, Henderson and Perez",Travel,Low,377,4794,0.07864,No,21,Yes
1,1,Tony Fernandez,"3953 Cindy Brook Apt. 147,East Lindatown, TN 4...",Moore-Goodwin,Medical,High,1440,14390,0.100069,No,24,Yes
2,2,Christopher Kim,"8693 Walters Mountains,South Tony, TX 88407",Smith-Holmes,Phone,Medium,256,1875,0.136533,No,18,Yes
3,3,Nicole Allen,"56926 Webster Coves,Shawnmouth, NV 04853",Harrell-Perez,Phone,Medium,233,1875,0.124267,No,24,Yes
4,4,Linda Cruz,"489 Thomas Forges Apt. 305,Jesseton, GA 36765","Simpson, Kramer and Hughes",Phone,Medium,239,1875,0.127467,No,21,Yes


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Unnamed: 0            200000 non-null  int64  
 1   Customer Name         200000 non-null  object 
 2   Customer_Address      200000 non-null  object 
 3   Company Name          200000 non-null  object 
 4   Claim Reason          200000 non-null  object 
 5   Data confidentiality  200000 non-null  object 
 6   Claim Amount          200000 non-null  int64  
 7   Category Premium      200000 non-null  int64  
 8   Premium/Amount Ratio  200000 non-null  float64
 9   Claim Request output  200000 non-null  object 
 10  BMI                   200000 non-null  int64  
 11  Churn                 200000 non-null  object 
dtypes: float64(1), int64(4), object(7)
memory usage: 18.3+ MB


In [5]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Claim Amount,Category Premium,Premium/Amount Ratio,BMI
count,200000.0,200000.0,200000.0,200000.0,200000.0
mean,99999.5,1120.47884,8963.783895,0.125024,23.007205
std,57735.171256,796.660796,6114.737202,0.034742,3.164976
min,0.0,1.0,399.0,0.002506,18.0
25%,49999.75,245.0,1875.0,0.106741,20.0
50%,99999.5,1390.0,14390.0,0.125122,23.0
75%,149999.25,1844.0,14390.0,0.143155,26.0
max,199999.0,2299.0,14390.0,0.24812,28.0


# # Handling Missing Data

In [6]:
# Step 1: Remove Duplicate Rows
data.drop_duplicates(inplace=True)

In [7]:
# Step 2: Remove Irrelevant Columns
# Identify and drop columns that are not relevant for churn prediction
irrelevant_columns = ['Customer Name', 'Customer_Address', 'Company Name', 'Data confidentiality', 'Claim Amount', 'Category Premium', 'Premium/Amount Ratio']
data.drop(columns=irrelevant_columns, inplace=True)



In [8]:
# Step 3: Data Preprocessing
# After removing duplicates and irrelevant columns, you may proceed with data preprocessing
# This may include handling missing data, encoding categorical variables, and scaling/normalizing numerical features

# Handling Missing Data
data.dropna(subset=['Churn'], inplace=True)

In [9]:
# Encoding Categorical Variables
# Identify categorical columns in your dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Load your dataset
data = pd.read_csv('randomdata.csv')
# Identify categorical columns in your dataset
categorical_columns = ['Company Name', 'Claim Reason', 'Category Premium']
# Perform label encoding for categorical columns
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
# Display the resulting DataFrame
print(data)

        Unnamed: 0    Customer Name  \
0                0  Christine Payne   
1                1   Tony Fernandez   
2                2  Christopher Kim   
3                3     Nicole Allen   
4                4       Linda Cruz   
...            ...              ...   
199995      199995  Matthew Estrada   
199996      199996       James Bean   
199997      199997      David Meyer   
199998      199998     Martha Stone   
199999      199999    Shannon Lewis   

                                         Customer_Address  Company Name  \
0       7627 Anderson Rest Apt. 265,Lake Heather, DC 3...        122584   
1       3953 Cindy Brook Apt. 147,East Lindatown, TN 4...         77347   
2             8693 Walters Mountains,South Tony, TX 88407        106968   
3                56926 Webster Coves,Shawnmouth, NV 04853         44952   
4           489 Thomas Forges Apt. 305,Jesseton, GA 36765        104639   
...                                                   ...           ...   
199995

In [10]:
#Scaling Numerical Features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data[['Claim Amount', 'Premium/Amount Ratio', 'BMI']] = scaler.fit_transform(data[['Claim Amount', 'Premium/Amount Ratio', 'BMI']])

In [11]:
# Data Splitting
import pandas as pd
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, data['Churn'], test_size=0.2, random_state=42)
# Print the shape of the training and testing sets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (160000, 12)
Testing set shape: (40000, 12)


In [12]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Load your dataset

data = pd.read_csv('randomdata.csv')
# Identify numerical columns in your dataset
numerical_columns = ['Claim Amount', 'Premium/Amount Ratio', 'BMI']
# Create a MinMaxScaler object
scaler = MinMaxScaler()
# Fit the MinMaxScaler to the numerical columns
scaler.fit(data[numerical_columns])
# Transform the numerical columns using the MinMaxScaler
data[numerical_columns] = scaler.transform(data[numerical_columns])
# Display the resulting DataFrame
print(data)

        Unnamed: 0    Customer Name  \
0                0  Christine Payne   
1                1   Tony Fernandez   
2                2  Christopher Kim   
3                3     Nicole Allen   
4                4       Linda Cruz   
...            ...              ...   
199995      199995  Matthew Estrada   
199996      199996       James Bean   
199997      199997      David Meyer   
199998      199998     Martha Stone   
199999      199999    Shannon Lewis   

                                         Customer_Address  \
0       7627 Anderson Rest Apt. 265,Lake Heather, DC 3...   
1       3953 Cindy Brook Apt. 147,East Lindatown, TN 4...   
2             8693 Walters Mountains,South Tony, TX 88407   
3                56926 Webster Coves,Shawnmouth, NV 04853   
4           489 Thomas Forges Apt. 305,Jesseton, GA 36765   
...                                                   ...   
199995       2024 Lopez Gateway,Lake Pamelafort, MS 35772   
199996             0268 Lori Falls,West Jef

DATA SPLITTING 80 20

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your dataset

data = pd.read_csv('randomdata.csv')

# Assuming 'Churn' is your target variable and you want to predict it
X = data.drop(columns=['Churn'])  # Features
y = data['Churn']  # Target variable

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the sizes of the resulting sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (160000, 11)
X_test shape: (40000, 11)
y_train shape: (160000,)
y_test shape: (40000,)


DATA SPLITTING 75 25

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your dataset

data = pd.read_csv('randomdata.csv')

# Assuming 'Churn' is your target variable and you want to predict it
X = data.drop(columns=['Churn'])  # Features
y = data['Churn']  # Target variable

# Split the data into training and testing sets (75% training, 25% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Display the sizes of the resulting sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (150000, 11)
X_test shape: (50000, 11)
y_train shape: (150000,)
y_test shape: (50000,)


DATA SPLITTING 85 15

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your dataset
# Replace 'randomdata.csv' with the actual path to your dataset
data = pd.read_csv('randomdata.csv')

# Assuming 'Churn' is your target variable and you want to predict it
X = data.drop(columns=['Churn'])  # Features
y = data['Churn']  # Target variable

# Split the data into training and testing sets (85% training, 15% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Display the sizes of the resulting sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (170000, 11)
X_test shape: (30000, 11)
y_train shape: (170000,)
y_test shape: (30000,)


# FEATURE SELECTION - Ensemble method

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('randomdata.csv')

# Assuming 'Churn' is your target variable and you want to predict it
# Drop non-numeric columns and target variable
X = data.drop(columns=['Churn', 'Customer Name', 'Customer_Address'])
y = data['Churn']  # Target variable

# Encode categorical variables using LabelEncoder
label_encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == 'object':
        X[column] = label_encoder.fit_transform(X[column])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier to your data
rf_classifier.fit(X_train, y_train)

# Evaluate the classifier on the testing data
accuracy = rf_classifier.score(X_test, y_test)
print("Accuracy:", accuracy)


Accuracy: 1.0


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report


data = pd.read_csv('randomdata.csv')

# Assuming 'Churn' is your target variable and you want to predict it
# Drop non-numeric columns and target variable
X = data.drop(columns=['Churn', 'Customer Name', 'Customer_Address'])
y = data['Churn']  # Target variable

# Encode categorical variables using LabelEncoder
label_encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == 'object':
        X[column] = label_encoder.fit_transform(X[column])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform oversampling on the training data
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Initialize a Deep Learning model (MLPClassifier is used here as an example)
clf = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

# Fit the classifier to your resampled training data
clf.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report_result)


Accuracy: 0.6986
Classification Report:
              precision    recall  f1-score   support

          No       0.72      0.28      0.41     14536
         Yes       0.70      0.94      0.80     25464

    accuracy                           0.70     40000
   macro avg       0.71      0.61      0.60     40000
weighted avg       0.70      0.70      0.66     40000



# DEEP LEARNING MODEL - Ensemble method

In [18]:
# Define a function to create a deep learning model
def create_model(input_dim):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [19]:
# Create three deep learning models
import numpy as np
from tensorflow.keras.models import Sequential  # Import the Sequential class
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



model1 = create_model(X_train.shape[1])
model2 = create_model(X_train.shape[1])
model3 = create_model(X_train.shape[1])


In [21]:
from sklearn.preprocessing import LabelEncoder

# Encode the target variable into numerical values (0 and 1)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Train the deep learning models
model1.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
model2.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
model3.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)



<keras.src.callbacks.History at 0x209c4001190>

In [22]:
# Make predictions using the individual models
pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)
pred3 = model3.predict(X_test)




In [23]:
# Ensemble the predictions using a simple averaging method
ensemble_preds = np.round((pred1 + pred2 + pred3) / 3)


In [24]:
# Calculate accuracy of the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)

print("Ensemble Model Accuracy:", ensemble_accuracy)

Ensemble Model Accuracy: 0.7327


In [25]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

# Make predictions using the individual models
pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)
pred3 = model3.predict(X_test)

# Ensemble the predictions using Voting Classifier
ensemble_preds = np.round((pred1 + pred2 + pred3) / 3)

# Calculate classification metrics
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
ensemble_precision = precision_score(y_test, ensemble_preds)
ensemble_f1 = f1_score(y_test, ensemble_preds)
ensemble_recall = recall_score(y_test, ensemble_preds)

# Print the evaluation metrics
print("Ensemble Model Metrics:")
print("Accuracy:", ensemble_accuracy)
print("Precision:", ensemble_precision)
print("F1 Score:", ensemble_f1)
print("Recall:", ensemble_recall)


Ensemble Model Metrics:
Accuracy: 0.7327
Precision: 0.7121316560399793
F1 Score: 0.8226336219767095
Recall: 0.973727615457116


Experimenting with another ensemble method - bagging

In [26]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

# Create a BaggingClassifier with a RandomForest base estimator
bagging_classifier = BaggingClassifier(
    base_estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    n_estimators=10, random_state=42)

# Fit the bagging ensemble to the data
bagging_classifier.fit(X_train, y_train)

# Evaluate the bagging ensemble
bagging_accuracy = bagging_classifier.score(X_test, y_test)

print("Bagging Ensemble Accuracy:", bagging_accuracy)






Bagging Ensemble Accuracy: 1.0
