In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [36]:
# Read the data from the CSV file and place it in a pandas DataFrame
df = pd.read_csv("customer_churn.csv")

# Display the first few rows of the DataFrame to confirm data has been loaded correctly
print(df.head())


              Names   Age  Total_Purchase  Account_Manager  Years  Num_Sites  \
0  Cameron Williams  42.0        11066.80                0   7.22        8.0   
1     Kevin Mueller  41.0        11916.22                0   6.50       11.0   
2       Eric Lozano  38.0        12884.75                0   6.67       12.0   
3     Phillip White  42.0         8010.76                0   6.71       10.0   
4    Cynthia Norton  37.0         9191.58                0   5.56        9.0   

          Onboard_date                                           Location  \
0  2013-08-30 07:00:40      10265 Elizabeth Mission Barkerburgh, AK 89518   
1  2013-08-13 00:38:46  6157 Frank Gardens Suite 019 Carloshaven, RI 1...   
2  2016-06-29 06:20:07             1331 Keith Court Alyssahaven, DE 90114   
3  2014-04-22 12:43:12       13120 Daniel Mount Angelabury, WY 30645-4695   
4  2016-01-19 15:31:15                765 Tricia Row Karenshire, MH 71730   

                       Company  Churn  
0               

In [37]:
# Check for null values in each column
null_counts = df.isnull().sum()

if null_counts.sum() == 0:
    print("***Data is clean*** \n")
else:
    print("***Data needs to be cleaned***")

# Display the number of null values for each column
print("Null values per column:")
print(null_counts)


***Data is clean*** 

Null values per column:
Names              0
Age                0
Total_Purchase     0
Account_Manager    0
Years              0
Num_Sites          0
Onboard_date       0
Location           0
Company            0
Churn              0
dtype: int64


In [38]:
# Columns to standardize
col = ['Age', 'Total_Purchase', 'Years', 'Num_Sites', 'Account_Manager']

# Standardize the selected columns
scaler = StandardScaler()

df[col] = scaler.fit_transform(df[col])

# Display the standardized columns
print(df[col])


          Age  Total_Purchase     Years  Num_Sites  Account_Manager
0    0.029936        0.417054  1.528446  -0.333235        -0.962910
1   -0.133352        0.769905  0.963182   1.367585        -0.962910
2   -0.623215        1.172234  1.096647   1.934526        -0.962910
3    0.029936       -0.852432  1.128051   0.800645        -0.962910
4   -0.786503       -0.361917  0.225198   0.233705        -0.962910
..        ...             ...       ...        ...              ...
895  0.029936        1.137369 -1.297874  -0.333235         1.038519
896  1.662814       -0.070163  1.285069  -0.900175        -0.962910
897  0.519800        0.828044  0.146690  -2.600995        -0.962910
898  1.499527       -1.472556  0.154540   0.800645         1.038519
899 -0.459927       -0.310398 -0.198750   0.800645         1.038519

[900 rows x 5 columns]


In [39]:
# Encode categorical columns using one-hot encoding
# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Feature Extraction
# Extract date features from 'Onboard_date' column
if 'Onboard_date' in df_encoded.columns:
    df_encoded['Onboard_year'] = pd.to_datetime(df_encoded['Onboard_date']).dt.year
    df_encoded['Onboard_month'] = pd.to_datetime(df_encoded['Onboard_date']).dt.month
    df_encoded['Onboard_day'] = pd.to_datetime(df_encoded['Onboard_date']).dt.day
    df_encoded.drop(columns=['Onboard_date'], inplace=True)

# The DataFrame is now encoded and ready for feature extraction
print(df_encoded.head())

        Age  Total_Purchase  Account_Manager     Years  Num_Sites  Churn  \
0  0.029936        0.417054         -0.96291  1.528446  -0.333235      1   
1 -0.133352        0.769905         -0.96291  0.963182   1.367585      1   
2 -0.623215        1.172234         -0.96291  1.096647   1.934526      1   
3  0.029936       -0.852432         -0.96291  1.128051   0.800645      1   
4 -0.786503       -0.361917         -0.96291  0.225198   0.233705      1   

   Names_Aaron Meyer  Names_Aaron West  Names_Abigail Gonzalez  \
0              False             False                   False   
1              False             False                   False   
2              False             False                   False   
3              False             False                   False   
4              False             False                   False   

   Names_Abigail Jennings  ...  Company_Yates, Martinez and Cox  \
0                   False  ...                            False   
1           

In [40]:
# Define the features and target variable
X = df.drop(['Names', 'Company', 'Onboard_date', 'Location', 'Churn'], axis=1)
y = df['Churn']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=58
)

# Display the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (720, 5)
X_test shape: (180, 5)
y_train shape: (720,)
y_test shape: (180,)


In [41]:
# # Separate features and target
# X = df_encoded.drop(columns='Churn')
# y = df_encoded['Churn']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train a basic Random Forest classifier
# model = RandomForestClassifier(random_state=42)

# # Train the model on the training data
# model.fit(X_train, y_train)

# # Evaluate the model on the testing data
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)

# # Print the accuracy
# print(f"Model accuracy on testing data: {accuracy:.2f}")
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))


In [42]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)

# Fit the model to the training data
logistic_model.fit(X_train, y_train)

# Predict the target variable for the testing data
y_pred = logistic_model.predict(X_test)

# Perform cross-validation
cv_scores = cross_val_score(logistic_model, X, y, cv=5)

# Calculate accuracy of the model on the testing data
accuracy = accuracy_score(y_test, y_pred)

# Print accuracy and classification report
print(f"Logistic Regression model accuracy on testing data: {accuracy:.2f}")
print("Cross-validation scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Logistic Regression model accuracy on testing data: 0.90
Cross-validation scores: [0.87777778 0.88888889 0.88888889 0.9        0.91666667]
Mean accuracy: 0.8944444444444445

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       148
           1       0.77      0.62      0.69        32

    accuracy                           0.90       180
   macro avg       0.85      0.79      0.82       180
weighted avg       0.89      0.90      0.90       180



In [43]:
# Load new customers data
pre = pd.read_csv('new_customers.csv')
pre

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company
0,Andrew Mccall,37.0,9935.53,1,7.71,8.0,2011-08-29 18:37:54,"38612 Johnny Stravenue Nataliebury, WI 15717-8316",King Ltd
1,Michele Wright,23.0,7526.94,1,9.28,15.0,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332, Youngport, ME...",Cannon-Benson
2,Jeremy Chang,65.0,100.0,1,1.0,15.0,2006-12-11 07:48:13,"085 Austin Views Lake Julialand, WY 63726-4298",Barron-Robertson
3,Megan Ferguson,32.0,6487.5,0,9.4,14.0,2016-10-28 05:32:13,"922 Wright Branch North Cynthialand, NC 64721",Sexton-Golden
4,Taylor Young,32.0,13147.71,1,10.0,8.0,2012-03-20 00:36:46,Unit 0789 Box 0734 DPO AP 39702,Wood LLC
5,Jessica Drake,22.0,8445.26,1,3.46,14.0,2011-02-04 19:29:27,1148 Tina Stravenue Apt. 978 South Carlos TX 2...,Parks-Robbins


In [44]:

pre[col] = scaler.transform(pre[col])

# Ensure that the new data has the same columns as the training data
new_data = pre.drop(['Names', 'Company', 'Onboard_date', 'Location'], axis=1)  # Exclude non-feature columns

# Make predictions on new data
s_pred = logistic_model.predict(new_data)
s_pred

array([0, 1, 1, 1, 0, 1], dtype=int64)