In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import joblib


In [5]:
# Load datasets
train_data = pd.read_csv("C:/Users/Awoleye/Downloads/TU_train(in).csv")
test_data = pd.read_csv("C:/Users/Awoleye/Downloads/TU_test.csv")

# Drop irrelevant columns
drop_cols = ["LATITUDE", "LONGITUDE", "CITY", "STATE", "COUNTY",
             "CURR_ANN_AMT", "HAS_CHILDREN", "LENGTH_OF_RESIDENCE",
             "DAYS_TENURE", "CUST_ORIG_DATE", "Age Group", "HOME_MARKET_VALUE", "Income Bracket"]

# Encode marital status
encoder = LabelEncoder()
train_data["Marital Status"] = encoder.fit_transform(train_data["MARITAL_STATUS"])
test_data["Marital Status"] = encoder.transform(test_data["MARITAL_STATUS"])

In [6]:

train_data.head()

Unnamed: 0,AGE_IN_YEARS,INCOME,HAS_CHILDREN,LENGTH_OF_RESIDENCE,MARITAL_STATUS,HOME_MARKET_VALUE,HOME_OWNER,COLLEGE_DEGREE,GOOD_CREDIT,CURR_ANN_AMT,...,CUST_ORIG_DATE,LATITUDE,LONGITUDE,CITY,STATE,COUNTY,Age Group,Income Bracket,CHURN,Marital Status
0,45.139,80372.176,0,0.0,Unknown,Unknown,0,0,0,591.92736,...,8/25/2022,0.0,0.0,Unknown,Unknown,Unknown,41-50,High,0,2
1,59.053,250000.0,1,14.0,Married,250000 - 274999,1,1,1,1277.431548,...,8/22/2012,32.977974,-97.13974,Southlake,TX,Tarrant,51-60,Very High,1,0
2,52.389,80372.176,0,6.801,Unknown,175000 - 199999,0,0,1,1108.684322,...,10/01/2017,0.0,0.0,Unknown,Unknown,Unknown,51-60,High,0,2
3,52.222,62500.0,1,7.0,Married,200000 - 224999,1,0,1,1650.580003,...,7/30/2022,33.230323,-96.650121,Mckinney,TX,Collin,51-60,High,1,0
4,48.474,80372.176,0,6.801,Single,25000 - 49999,0,0,0,829.123488,...,08/12/2022,32.794454,-96.785967,Dallas,TX,Dallas,41-50,High,1,1


In [8]:
# Separate features and target
X_train = train_data.drop("CHURN", axis=1)
y_train = train_data["CHURN"]
X_test = test_data[X_train.columns]


In [9]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
X_train_scaled

array([[-0.68890235,  0.01875529, -1.29119425, -0.58578644, -1.73142385,
         1.39009088],
       [ 0.43402277,  3.61149369,  0.77447681,  1.70710678,  0.57755933,
        -1.0020617 ],
       [-0.10379332,  0.01875529, -1.29119425, -0.58578644,  0.57755933,
         1.39009088],
       ...,
       [ 0.14275953,  0.01875529, -1.29119425, -0.58578644,  0.57755933,
         1.39009088],
       [ 0.14275953,  0.01875529, -1.29119425, -0.58578644,  0.57755933,
         1.39009088],
       [ 0.62932813,  3.61149369,  0.77447681,  1.70710678,  0.57755933,
        -1.0020617 ]])

In [20]:
# save preprocessing tools
joblib.dump(scaler, r"C:\Users\Awoleye\Documents\scaler1_model.pkl")



['C:\\Users\\Awoleye\\Documents\\scaler1_model.pkl']

In [22]:
joblib.dump(encoder, r"C:\Users\Awoleye\Documents\marital_encoder1.pkl")

['C:\\Users\\Awoleye\\Documents\\marital_encoder1.pkl']

In [17]:
#Train RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)



In [27]:
joblib.dump(model, r"C:\Users\Awoleye\Documents\model 1.pkl")

['C:\\Users\\Awoleye\\Documents\\model 1.pkl']

In [28]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate on training data
y_pred_train = model.predict(X_train_scaled)

print("Training Confusion Matrix:")
print(confusion_matrix(y_train, y_pred_train))

print("\nClassification Report:")
print(classification_report(y_train, y_pred_train))


Training Confusion Matrix:
[[ 60649      0]
 [     0 154758]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     60649
           1       1.00      1.00      1.00    154758

    accuracy                           1.00    215407
   macro avg       1.00      1.00      1.00    215407
weighted avg       1.00      1.00      1.00    215407



In [121]:
def make_prediction(input_data: dict):
    import pandas as pd
    import joblib

    model = joblib.load(r"C:\Users\Awoleye\Documents\model 1.pkl")
    scaler = joblib.load(r"C:\Users\Awoleye\Documents\scaler1_model.pkl")
    encoder = joblib.load(r"C:\Users\Awoleye\Documents\marital_encoder1.pkl")

def make_prediction(input_data: dict):
    input_df = pd.DataFrame([input_data])

    numerical_cols = ['AGE_IN_YEARS', 'INCOME', 'HOME_OWNER', 'COLLEGE_DEGREE', 'GOOD_CREDIT', 'Marital Status']
    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])

    # Predict
    prediction = model.predict(input_df)[0]
    probability = model.predict_proba(input_df)[0, 1]

    return {
        "Churn Status": "Churn" if prediction == 1 else "No Churn",
        "Probability": probability
    }


In [131]:
sample_input = {
    "AGE_IN_YEARS": 55.444,
    "INCOME": 80372.176,
    "HOME_OWNER": 0,
    "COLLEGE_DEGREE": 0,
    "GOOD_CREDIT": 0,
    "Marital Status": 2  # Must match what the encoder was trained on
}


In [132]:
result = make_prediction(sample_input)
print("Prediction:", result)


Prediction: {'Churn Status': 'No Churn', 'Probability': 0.00026696781141498604}




In [107]:
train_data.head(10)

Unnamed: 0,AGE_IN_YEARS,INCOME,HOME_OWNER,COLLEGE_DEGREE,GOOD_CREDIT,CHURN,Marital Status
0,45.139,80372.176,0,0,0,0,2
1,59.053,250000.0,1,1,1,1,0
2,52.389,80372.176,0,0,1,0,2
3,52.222,62500.0,1,0,1,1,0
4,48.474,80372.176,0,0,0,1,1
5,92.389,27500.0,1,1,0,1,0
6,41.139,37500.0,1,1,1,1,0
7,82.968,22500.0,1,0,0,0,2
8,54.053,70000.0,0,1,1,1,1
9,55.444,62500.0,1,0,1,1,0


In [128]:
test_data.head(10)

Unnamed: 0,AGE_IN_YEARS,INCOME,HOME_OWNER,COLLEGE_DEGREE,GOOD_CREDIT,Marital Status
0,55.444,80372.176,0,0,0,2
1,77.725,125000.0,1,0,1,0
2,70.138,70000.0,1,0,1,0
3,48.726,70000.0,1,0,1,1
4,70.557,87500.0,1,0,1,0
5,37.139,87500.0,1,1,1,1
6,55.444,80372.176,0,0,1,2
7,55.444,32500.0,0,0,1,2
8,55.444,80372.176,0,0,0,2
9,42.22,70000.0,1,1,1,0
