## **Import Packages**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

## **Load Files (replace 'june13-july13_user_behavior.csv' with the file name of the new data to score)**


In [None]:
# Input CSV files: the data the model is trained on and the new data it is applied to.
TRAIN_DATA_CSV = 'user_behavior_final.csv'
NEW_DATA_CSV = 'june13-july13_user_behavior.csv'

# low_memory=False makes pandas parse each file in a single pass instead of in chunks.
user_behavior_train = pd.read_csv(TRAIN_DATA_CSV, low_memory=False)
new_user_behavior = pd.read_csv(NEW_DATA_CSV, low_memory=False)

# **Data Preparation and Training**

**Data preparation for unseen data**

In [None]:
# Keep the user identifiers in their own single-column DataFrame (an independent copy)
# so they can be re-attached to the predictions later.
iduser_new_user_behavior = new_user_behavior.loc[:, ['iduser']].copy()

# All remaining columns (everything except 'iduser') form the frame handed to the model.
new_user_behavior_no_iduser = new_user_behavior.drop('iduser', axis=1)


**Data preparation for model training**

In [None]:
# Build the target column 'churn' from the source column 'paid_user':
# the column is renamed and its 0/1 values are swapped (0 becomes 1, 1 becomes 0).
#
# The guard keeps this cell safe to re-run. Without it, a second execution would
# skip the rename silently (the source column no longer exists) and then swap the
# labels again, leaving 'churn' inverted.
if 'paid_user' in user_behavior_train.columns:
    user_behavior_train = user_behavior_train.rename(columns={'paid_user': 'churn'})

    # A dict passed to replace() maps both values in one step, so the two
    # substitutions do not cascade into each other.
    user_behavior_train['churn'] = user_behavior_train['churn'].replace({0: 1, 1: 0})

In [None]:
# Variance inflation factor (VIF) for every column of the training frame.
# NOTE(review): at this point the frame still contains 'iduser' and the target
# 'churn', so both take part in the calculation - confirm that this is intended.
vif_matrix = user_behavior_train.values
vif_scores = []
for column_index in range(user_behavior_train.shape[1]):
    vif_scores.append(variance_inflation_factor(vif_matrix, column_index))

# One row per column: its name and its VIF score.
vif = pd.DataFrame({
    "Feature": user_behavior_train.columns,
    "VIF Score": vif_scores,
})

# Print the VIF table
print(vif)

             Feature  VIF Score
0             iduser   4.610919
1              churn   3.337196
2       streak_count   1.438091
3        hero_area>1   2.000543
4       battle_count   4.190796
5         file_count   1.865816
6          num_items   5.697546
7      mission_count   3.586429
8   wardrobe_updated   1.849511
9          wordcount   2.629728
10    friends_amount   1.063740
11        chat_count   1.042952


In [None]:
# Remove 'num_items' from the training frame; it has the highest VIF score in the
# table printed above and is treated as a highly correlated, less important column.
columns_to_drop = ['num_items']
user_behavior_train = user_behavior_train.drop(columns=columns_to_drop)

In [None]:
# Target is 'churn'; features are every column except the identifier and the target.
y = user_behavior_train['churn']
X = user_behavior_train.drop(columns=['iduser', 'churn'])

# First hold out 30% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=52,
)

# ... then hold out 30% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=52,
)

# Report the shape of every resulting dataset.
split_shapes = {
    "X_train": X_train.shape,
    "y_train": y_train.shape,
    "X_val": X_val.shape,
    "y_val": y_val.shape,
    "X_test": X_test.shape,
    "y_test": y_test.shape,
}
for split_name, split_shape in split_shapes.items():
    print(f"{split_name} shape: {split_shape}")

X_train shape: (24001, 9)
y_train shape: (24001,)
X_val shape: (10287, 9)
y_val shape: (10287,)
X_test shape: (14695, 9)
y_test shape: (14695,)


In [None]:
# Balance the classes of the training split by generating synthetic samples with SMOTE.
# Only the training split is resampled; the validation and test splits are not touched here.
smote = SMOTE(random_state=42)
resampled_split = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_split


In [None]:
# Hyperparameters for the random forest. They are hard-coded here as the "best"
# parameters; the search that produced them is not part of this section.
best_parameters = {'bootstrap': True, 'criterion': 'entropy', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2}

# Seed the forest so that bootstrap sampling and feature selection are repeatable.
# Without a seed every run trains a different model and yields different predictions,
# even though the split and the SMOTE step are already seeded.
model = RandomForestClassifier(**best_parameters, random_state=42)

# Fit the model on the SMOTE-resampled training data
model.fit(X_train_resampled, y_train_resampled)

# **Apply on the New Data**

In [None]:
# Apply the trained model to the unseen dataset.
#
# The model was fitted on the columns of X_train (the training frame without
# 'iduser', 'churn' and the dropped 'num_items'), whereas the unseen frame only had
# 'iduser' removed. Selecting the training columns explicitly guarantees that the
# model receives the same features in the same order as during fitting. If the new
# data lacks one of these columns, the selection raises a KeyError naming it.
feature_columns = list(X_train.columns)
predictions = model.predict(new_user_behavior_no_iduser[feature_columns])




In [None]:
# Get 'iduser' values for the unseen data
iduser_values = iduser_new_user_behavior['iduser']

# One row per user: the identifier and the predicted churn label
results_df = pd.DataFrame({
    'iduser': iduser_values,
    'churn_prediction': predictions
})

# Append the columns of the unseen data next to the predictions (aligned on the row index)
results_df = pd.concat([results_df, new_user_behavior_no_iduser], axis=1)

# Save the full DataFrame to an Excel file
results_df.to_excel('user_churn_predictions.xlsx', index=False)

# Show a preview of the result. Only the last expression of a cell is rendered, so the
# preview has to come after the export; head() avoids dumping the whole table.
results_df.head()