In [1]:
!pip install -q gradio pandas numpy scikit-learn


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
import random

def generate_student_dataset(student_id):
    try:
        seed_val = int(student_id)
    except ValueError:
        print("Error: Student ID must be an integer.")
        return None

    np.random.seed(seed_val)
    random.seed(seed_val)

    n_samples = 1000
    age = np.random.normal(35, 12, n_samples).astype(int)
    income = np.random.exponential(50000, n_samples) + 20000
    credit_score = np.random.randint(300, 850, n_samples)
    years_loyalty = np.random.poisson(3, n_samples)
    support_tickets = np.random.poisson(1.5, n_samples)
    gender = np.random.choice(['Male', 'Female'], n_samples)
    contract = np.random.choice(['Month-to-Month', 'One Year', 'Two Year'], n_samples, p=[0.5, 0.3, 0.2])
    internet = np.random.choice(['Saf Home', 'Faiba', 'No'], n_samples)
    payment = np.random.choice(['Pesa Pal', 'M-Pesa', 'Equitel', 'Cash'], n_samples)

    logit = (
        -0.05 * age +
        -0.00001 * income +
        -0.002 * credit_score +
        -0.5 * years_loyalty +
        0.8 * support_tickets +
        (np.where(contract == 'Month-to-Month', 2, 0)) +
        (np.where(internet == 'Faiba', 0.5, -0.5)) +
        np.random.normal(0, 2, n_samples)
    )

    prob = 1 / (1 + np.exp(-logit))
    target = (prob > 0.5).astype(int)

    df = pd.DataFrame({
        'Age': age,
        'Annual_Income': income,
        'Years_Loyalty': years_loyalty,
        'Num_Support_Tickets': support_tickets,
        'Gender': gender,
        'Contract_Type': contract,
        'Internet_Service': internet,
        'Payment_Method': payment,
        'Churn': target
    })

    for col in ['Annual_Income', 'Internet_Service']:
        mask = np.random.choice([True, False], size=n_samples, p=[0.10, 0.90])
        df.loc[mask, col] = np.nan

    outlier_indices = np.random.choice(df.index, size=5, replace=False)
    df.loc[outlier_indices, 'Annual_Income'] = df.loc[outlier_indices, 'Annual_Income'] * 10

    typo_indices = np.random.choice(df.index, size=3, replace=False)
    df.loc[typo_indices, 'Age'] = -1 * df.loc[typo_indices, 'Age']

    print(f"Dataset generated for Student ID: {student_id}")
    print(f"Shape: {df.shape}")
    return df

In [2]:


student_id =  167052
df = generate_student_dataset(student_id)
df.head()


Dataset generated for Student ID: 167052
Shape: (1000, 9)


Unnamed: 0,Age,Annual_Income,Years_Loyalty,Num_Support_Tickets,Gender,Contract_Type,Internet_Service,Payment_Method,Churn
0,33,70985.704785,3,2,Male,One Year,,Pesa Pal,0
1,44,31231.643101,1,4,Male,One Year,Saf Home,Equitel,0
2,38,60829.013503,1,2,Female,Month-to-Month,No,M-Pesa,0
3,30,20179.133953,6,4,Female,One Year,Saf Home,M-Pesa,0
4,27,222138.661273,3,3,Male,One Year,Faiba,Pesa Pal,0


In [3]:

print("Shape:", df.shape)
print("\nInfo:")
df.info()

print("\nDescriptive statistics:")
df.describe(include='all')


Shape: (1000, 9)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  1000 non-null   int64  
 1   Annual_Income        904 non-null    float64
 2   Years_Loyalty        1000 non-null   int32  
 3   Num_Support_Tickets  1000 non-null   int32  
 4   Gender               1000 non-null   object 
 5   Contract_Type        1000 non-null   object 
 6   Internet_Service     911 non-null    object 
 7   Payment_Method       1000 non-null   object 
 8   Churn                1000 non-null   int64  
dtypes: float64(1), int32(2), int64(2), object(4)
memory usage: 62.6+ KB

Descriptive statistics:


Unnamed: 0,Age,Annual_Income,Years_Loyalty,Num_Support_Tickets,Gender,Contract_Type,Internet_Service,Payment_Method,Churn
count,1000.0,904.0,1000.0,1000.0,1000,1000,911,1000,1000.0
unique,,,,,2,3,3,4,
top,,,,,Female,Month-to-Month,Faiba,M-Pesa,
freq,,,,,521,494,313,263,
mean,34.237,67835.978242,2.962,1.513,,,,,0.147
std,12.733537,50337.448755,1.773048,1.251158,,,,,0.354283
min,-39.0,20003.774025,0.0,0.0,,,,,0.0
25%,26.0,33421.596745,2.0,1.0,,,,,0.0
50%,35.0,51831.006509,3.0,1.0,,,,,0.0
75%,43.0,84388.6178,4.0,2.0,,,,,0.0


In [4]:
missing_counts = df.isna().sum().sort_values(ascending=False)
missing_counts


Annual_Income          96
Internet_Service       89
Age                     0
Num_Support_Tickets     0
Years_Loyalty           0
Gender                  0
Contract_Type           0
Payment_Method          0
Churn                   0
dtype: int64

In [5]:
df[df['Age'] < 0]


Unnamed: 0,Age,Annual_Income,Years_Loyalty,Num_Support_Tickets,Gender,Contract_Type,Internet_Service,Payment_Method,Churn
136,-33,,5,3,Male,Month-to-Month,Saf Home,Cash,0
217,-36,,0,0,Male,Two Year,No,Cash,0
290,-5,48324.951874,5,1,Male,Month-to-Month,Faiba,Equitel,0
387,-2,,3,1,Female,Month-to-Month,No,Cash,0
421,-4,35251.490184,3,0,Female,Month-to-Month,No,Cash,0
515,-4,65733.40035,3,0,Female,Month-to-Month,Saf Home,Pesa Pal,0
533,-3,61427.162181,5,0,Female,Month-to-Month,No,Pesa Pal,0
840,-39,101711.083346,2,1,Male,One Year,Faiba,M-Pesa,0


In [6]:
df_clean = df.copy()
df_clean.loc[df_clean['Age'] < 0, 'Age'] = abs(df_clean.loc[df_clean['Age'] < 0, 'Age'])

df_clean.head()


Unnamed: 0,Age,Annual_Income,Years_Loyalty,Num_Support_Tickets,Gender,Contract_Type,Internet_Service,Payment_Method,Churn
0,33,70985.704785,3,2,Male,One Year,,Pesa Pal,0
1,44,31231.643101,1,4,Male,One Year,Saf Home,Equitel,0
2,38,60829.013503,1,2,Female,Month-to-Month,No,M-Pesa,0
3,30,20179.133953,6,4,Female,One Year,Saf Home,M-Pesa,0
4,27,222138.661273,3,3,Male,One Year,Faiba,Pesa Pal,0


In [7]:
df_clean['Annual_Income'] = df_clean['Annual_Income'].fillna(df_clean['Annual_Income'].median())

df_clean['Internet_Service'] = df_clean['Internet_Service'].fillna(df_clean['Internet_Service'].mode()[0])

df_clean.isna().sum()


Age                    0
Annual_Income          0
Years_Loyalty          0
Num_Support_Tickets    0
Gender                 0
Contract_Type          0
Internet_Service       0
Payment_Method         0
Churn                  0
dtype: int64

In [8]:
X = df_clean.drop(columns=['Churn'])
y = df_clean['Churn']

X.head(), y.head()


(   Age  Annual_Income  Years_Loyalty  Num_Support_Tickets  Gender  \
 0   33   70985.704785              3                    2    Male   
 1   44   31231.643101              1                    4    Male   
 2   38   60829.013503              1                    2  Female   
 3   30   20179.133953              6                    4  Female   
 4   27  222138.661273              3                    3    Male   
 
     Contract_Type Internet_Service Payment_Method  
 0        One Year            Faiba       Pesa Pal  
 1        One Year         Saf Home        Equitel  
 2  Month-to-Month               No         M-Pesa  
 3        One Year         Saf Home         M-Pesa  
 4        One Year            Faiba       Pesa Pal  ,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: Churn, dtype: int64)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((800, 8), (200, 8))

In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_cols = ['Age', 'Annual_Income', 'Years_Loyalty', 'Num_Support_Tickets']
cat_cols = ['Gender', 'Contract_Type', 'Internet_Service', 'Payment_Method']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)


In [11]:
from sklearn.linear_model import LogisticRegression

log_reg_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

log_reg_model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [12]:
from sklearn.tree import DecisionTreeClassifier

tree_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', DecisionTreeClassifier(max_depth=6, random_state=42))
])

tree_model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [13]:
from sklearn.metrics import accuracy_score, f1_score

models = {
    "Logistic Regression": log_reg_model,
    "Decision Tree": tree_model
}

for name, model in models.items():
    preds = model.predict(X_test)
    print(name,
          "Accuracy:", accuracy_score(y_test, preds),
          "F1:", f1_score(y_test, preds))


Logistic Regression Accuracy: 0.875 F1: 0.358974358974359
Decision Tree Accuracy: 0.865 F1: 0.3076923076923077


In [14]:
import joblib

best_model = max(models.items(), key=lambda m: f1_score(y_test, m[1].predict(X_test)))[1]
joblib.dump(best_model, "best_churn_model.joblib")


['best_churn_model.joblib']

In [15]:
import gradio as gr
import pandas as pd

def predict_churn(Age, Annual_Income, Years_Loyalty, Num_Support_Tickets,
                  Gender, Contract_Type, Internet_Service, Payment_Method):

    sample = pd.DataFrame([[
        Age, Annual_Income, Years_Loyalty, Num_Support_Tickets,
        Gender, Contract_Type, Internet_Service, Payment_Method
    ]], columns=X.columns)

    pred = best_model.predict(sample)[0]
    return "High Risk of Churn" if pred == 1 else "Low Risk of Churn"

ui = gr.Interface(
    fn=predict_churn,
    inputs=[
        gr.Number(label="Age"),
        gr.Number(label="Annual_Income"),
        gr.Number(label="Years_Loyalty"),
        gr.Number(label="Num_Support_Tickets"),
        gr.Dropdown(["Male", "Female"], label="Gender"),
        gr.Dropdown(["Month-to-Month", "One Year", "Two Year"], label="Contract Type"),
        gr.Dropdown(["Saf Home", "Faiba", "No"], label="Internet Service"),
        gr.Dropdown(["Pesa Pal", "M-Pesa", "Equitel", "Cash"], label="Payment Method")
    ],
    outputs="text",
    title="Churn Predictor"
)

ui.launch()


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Exception in callback _ProactorBasePipeTransport._call_connection_lost()
handle: <Handle _ProactorBasePipeTransport._call_connection_lost()>
Traceback (most recent call last):
  File "C:\Python313\Lib\asyncio\events.py", line 89, in _run
    self._context.run(self._callback, *self._args)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python313\Lib\asyncio\proactor_events.py", line 165, in _call_connection_lost
    self._sock.shutdown(socket.SHUT_RDWR)
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
