In [1]:
import pandas as pd

# Step 1: Data Preprocessing
# Load the dataset
data = pd.read_excel("./customer_churn_large_dataset.xlsx")

# Initial exploration
print(data.head())
print(data.info())
print(data.describe())




   CustomerID        Name  Age  Gender     Location  \
0           1  Customer_1   63    Male  Los Angeles   
1           2  Customer_2   62  Female     New York   
2           3  Customer_3   24  Female  Los Angeles   
3           4  Customer_4   36  Female        Miami   
4           5  Customer_5   46  Female        Miami   

   Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn  
0                          17         73.36             236      0  
1                           1         48.76             172      0  
2                           5         85.47             460      0  
3                           3         97.94             297      1  
4                          19         58.14             266      0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID               

**STEP:1 Data preprocessing:** 

In [2]:
#dropping CustomerID , Name
data.drop(columns=['CustomerID','Name'], inplace=True)
print(data.columns)


Index(['Age', 'Gender', 'Location', 'Subscription_Length_Months',
       'Monthly_Bill', 'Total_Usage_GB', 'Churn'],
      dtype='object')


In [3]:
# seperating input and output data
X=data.drop('Churn',axis=1)
y=data['Churn']
print(X.head(),y.head())

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  \
0   63    Male  Los Angeles                          17         73.36   
1   62  Female     New York                           1         48.76   
2   24  Female  Los Angeles                           5         85.47   
3   36  Female        Miami                           3         97.94   
4   46  Female        Miami                          19         58.14   

   Total_Usage_GB  
0             236  
1             172  
2             460  
3             297  
4             266   0    0
1    0
2    0
3    1
4    0
Name: Churn, dtype: int64


**2. Feature Engineering:**

In [4]:
# checking the unique locations to encode accourdingly
Unique_Locations= data['Location'].unique()
print(Unique_Locations)
encoded_Locations = pd.get_dummies(data['Location'],prefix='Location').astype(int)
print(encoded_Locations.head())

['Los Angeles' 'New York' 'Miami' 'Chicago' 'Houston']
   Location_Chicago  Location_Houston  Location_Los Angeles  Location_Miami  \
0                 0                 0                     1               0   
1                 0                 0                     0               0   
2                 0                 0                     1               0   
3                 0                 0                     0               1   
4                 0                 0                     0               1   

   Location_New York  
0                  0  
1                  1  
2                  0  
3                  0  
4                  0  


In [5]:
#dropping Original Locaton column
X.drop('Location',axis=1,inplace=True)
X.head()

Unnamed: 0,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
0,63,Male,17,73.36,236
1,62,Female,1,48.76,172
2,24,Female,5,85.47,460
3,36,Female,3,97.94,297
4,46,Female,19,58.14,266


In [6]:
X=pd.concat([X,encoded_Locations],axis=1)
X.head()

Unnamed: 0,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Location_Chicago,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
0,63,Male,17,73.36,236,0,0,1,0,0
1,62,Female,1,48.76,172,0,0,0,0,1
2,24,Female,5,85.47,460,0,0,1,0,0
3,36,Female,3,97.94,297,0,0,0,1,0
4,46,Female,19,58.14,266,0,0,0,1,0


In [7]:
# encoding Gender column into numerical data 
X['Gender']=(X['Gender']=='Male').astype(int)# Male==1 female == 0
X.head()

Unnamed: 0,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Location_Chicago,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
0,63,1,17,73.36,236,0,0,1,0,0
1,62,0,1,48.76,172,0,0,0,0,1
2,24,0,5,85.47,460,0,0,1,0,0
3,36,0,3,97.94,297,0,0,0,1,0
4,46,0,19,58.14,266,0,0,0,1,0


In [81]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Fit the model to your data
rf_model.fit(X, y)

# Get feature importances
feature_importances = rf_model.feature_importances_

# Create a DataFrame to show feature importance scores
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)


                      Feature  Importance
3                Monthly_Bill    0.324400
4              Total_Usage_GB    0.299598
0                         Age    0.193522
2  Subscription_Length_Months    0.146805
1                      Gender    0.012813
5            Location_Chicago    0.005008
7        Location_Los Angeles    0.004762
8              Location_Miami    0.004703
6            Location_Houston    0.004315
9           Location_New York    0.004073


In [8]:
X_without_location= X.drop(columns=encoded_Locations.columns)

X_without_location.head()

Unnamed: 0,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
0,63,1,17,73.36,236
1,62,0,1,48.76,172
2,24,0,5,85.47,460
3,36,0,3,97.94,297
4,46,0,19,58.14,266


In [9]:
data_preprocessed=pd.concat([X_without_location,y],axis=1)
data_preprocessed.head()

Unnamed: 0,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,1,17,73.36,236,0
1,62,0,1,48.76,172,0
2,24,0,5,85.47,460,0
3,36,0,3,97.94,297,1
4,46,0,19,58.14,266,0


In [94]:
#saving this new data in a csv file
data_preprocessed.to_csv("./preprocessed_customer_churn_large_dataset.csv",index=False)

In [10]:
data_preprocessed2=pd.concat([X,y],axis=1)
data_preprocessed2.head()

Unnamed: 0,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Location_Chicago,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York,Churn
0,63,1,17,73.36,236,0,0,1,0,0,0
1,62,0,1,48.76,172,0,0,0,0,1,0
2,24,0,5,85.47,460,0,0,1,0,0,0
3,36,0,3,97.94,297,0,0,0,1,0,1
4,46,0,19,58.14,266,0,0,0,1,0,0


In [102]:
#saving this new data in a csv file
data_preprocessed2.to_csv("./preprocessed_customer_churn_large_dataset_with_location.csv",index=False)

**3. Model Building:**

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Train a machine learning model, evaluate its performance, and return the model and metrics.
    
    Parameters:
    model (object): A machine learning model instance.
    X_train (DataFrame): Training feature data.
    y_train (Series): Training labels.
    X_test (DataFrame): Testing feature data.
    y_test (Series): Testing labels.
    
    Returns:
    trained_model (object): Trained machine learning model.
    metrics (dict): Dictionary containing performance metrics.
    """
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Create a dictionary to store metrics
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }
    
    return model, metrics


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# data=pd.read_csv('./preprocessed_customer_churn_large_dataset.csv') #data without locations
data=pd.read_csv('./preprocessed_customer_churn_large_dataset_with_location.csv')#data without locations 
# Separate features and target
X = data.drop('Churn', axis=1)
y = data['Churn']
# Load your data and split it into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X_without_location, y, test_size=0.2, random_state=42)
# Initialize a list to store models and their metrics
models = []

# Initialize different machine learning models
random_forest = RandomForestClassifier()
svm = SVC()
logistic_regression = LogisticRegression()

# Loop through each model and train/evaluate
for model in [random_forest, svm, logistic_regression]:
    trained_model, metrics = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)
    models.append((trained_model, metrics))

# Compare model performance
for i, (model, metrics) in enumerate(models):
    print(f"Model {i + 1}: {type(model).__name__}")
    print("Accuracy:", metrics['accuracy'])
    print("Precision:", metrics['precision'])
    print("Recall:", metrics['recall'])
    print("F1-Score:", metrics['f1_score'])
    print()


Model 1: RandomForestClassifier
Accuracy: 0.50125
Precision: 0.4971246006389776
Recall: 0.47051708497127304
F1-Score: 0.48345502563305887

Model 2: SVC
Accuracy: 0.50435
Precision: 0.5016353229762878
Recall: 0.12367704868460841
F1-Score: 0.19843130912913398

Model 3: LogisticRegression
Accuracy: 0.49985
Precision: 0.4930649526387009
Recall: 0.2938211873803044
F1-Score: 0.36821827827954273



In [18]:
import joblib  # for scikit-learn version >= 0.24

# Loop through each trained model in the list and save them
for i, (model, metrics) in enumerate(models):
    model_name = type(model).__name__
    num_features = len(model.feature_names_in_)
    model_filename = f"../Trained_models/{model_name}_{num_features}_cols.pkl"
    joblib.dump(model, model_filename)
    print(f"Model {i + 1} ({model_name}) saved as: {model_filename}")


Model 1 (RandomForestClassifier) saved as: ../Trained_models/RandomForestClassifier_5_cols.pkl
Model 2 (SVC) saved as: ../Trained_models/SVC_5_cols.pkl
Model 3 (LogisticRegression) saved as: ../Trained_models/LogisticRegression_5_cols.pkl


**data with locatios**

In [97]:
import joblib
joblib.dump(test1_rf_model,'../Trained_models/test1_rf_model.pkl')

['./Trained_models/test1_rf_model.pkl']

In [99]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
def print_performence(model,y_test,y_pred):
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    
    # Calculate precision
    precision = precision_score(y_test, y_pred)
    print("Precision:", precision)
    
    # Calculate recall
    recall = recall_score(y_test, y_pred)
    print("Recall:", recall)
    
    # Calculate F1-score
    f1 = f1_score(y_test, y_pred)
    print("F1-Score:", f1)
    
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)


In [98]:
from sklearn.model_selection import train_test_split
import joblib
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

test2_rf_model = RandomForestClassifier(random_state=42)
test2_rf_model.fit(X_train,y_train)
#saving the model
joblib.dump(test2_rf_model,'./Trained_models/test1_rf_model.pkl')

['./Trained_models/test1_rf_model.pkl']

In [100]:
print_performence(test2_rf_model,y_test,y_pred)

Accuracy: 0.49215
Precision: 0.48773388773388776
Recall: 0.47293619594798914
F1-Score: 0.4802210736400389
Confusion Matrix:
 [[5151 4928]
 [5229 4692]]
