In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
np.random.seed(42)
data_size=350000

In [7]:
df = pd.DataFrame({
    'Age' : np.random.randint(18,70,data_size),
    'Gender' : np.random.choice(['Male','Female'],data_size),
    'Monthly_Spending' : np.random.uniform(10,500,data_size),
    'Total_Transaction' : np.random.randint(1,50,data_size),
    'Churn' : np.random.choice([0,1],data_size,p=[0.8,0.2])
})

In [9]:
df.columns

Index(['Age', 'Gender', 'Monthly_Spending', 'Total_Transaction', 'Churn'], dtype='object')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350000 entries, 0 to 349999
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Age                350000 non-null  int64  
 1   Gender             350000 non-null  object 
 2   Monthly_Spending   350000 non-null  float64
 3   Total_Transaction  350000 non-null  int64  
 4   Churn              350000 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 13.4+ MB


In [13]:
df.isnull().sum()

Age                  0
Gender               0
Monthly_Spending     0
Total_Transaction    0
Churn                0
dtype: int64

In [15]:
encoder = OneHotEncoder(sparse_output=False,drop='first')
gender_encoder = encoder.fit_transform(df[['Gender']])

In [21]:
gender_df = pd.DataFrame(gender_encoder, columns=encoder.get_feature_names_out(['Gender']))

In [27]:
df = pd.concat([df.drop(columns=['Gender']),gender_df],axis=1)

In [29]:
df.head(2)

Unnamed: 0,Age,Monthly_Spending,Total_Transaction,Churn,Gender_Male
0,45,234.849092,32,1,0.0
1,18,286.409962,27,0,0.0


In [31]:
x=df.drop(columns=['Churn'])
y=df['Churn']

In [35]:
scaler= StandardScaler()
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)

In [37]:
smote = SMOTE(sampling_strategy=0.5,random_state=42)
x_resampled, y_resampled = smote.fit_resample(x,y)

In [39]:
X_train,X_test,Y_train,Y_test = train_test_split(x_resampled,y_resampled,test_size=0.2,random_state=42)

In [45]:
model= RandomForestClassifier(n_estimators=100, random_state=42,n_jobs=-1)

In [47]:
model.fit(X_train,Y_train)

In [49]:
predictions = model.predict(X_test)

In [51]:
accuracy = accuracy_score(Y_test,predictions)

In [53]:
print("Accuracy:",accuracy)

Accuracy: 0.7604953870360216


In [55]:
print("Classification report: ")
print(classification_report(Y_test, predictions))

Classification report: 
              precision    recall  f1-score   support

           0       0.79      0.88      0.83     55973
           1       0.69      0.52      0.59     27921

    accuracy                           0.76     83894
   macro avg       0.74      0.70      0.71     83894
weighted avg       0.75      0.76      0.75     83894

