In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# Load datasets
df_final_demo_cleanead = pd.read_csv(r'C:\Users\VSPC\Documents\GitHub\New-Web-Study\data\df_final_demo_cleanead.csv')
df_web_data_total = pd.read_csv(r'C:\Users\VSPC\Documents\GitHub\New-Web-Study\data\df_web_data_total.csv')
df_final_experiment_clients_1 = pd.read_csv(r'C:\Users\VSPC\Documents\GitHub\New-Web-Study\data\raw\df_final_experiment_clients.txt')

# Merging the demo and web data on client_id
merged_data = pd.merge(df_final_demo_cleanead, df_web_data_total, on='client_id', how='inner')

# Merging the experiment clients data to add the target variable 'Variation'
final_data = pd.merge(merged_data, df_final_experiment_clients_1, on='client_id', how='left')

# Dropping rows where 'Variation' is NaN
final_data_clean = final_data.dropna(subset=['Variation'])

# Encoding the categorical features 'gender' and 'process_step'
label_encoder = LabelEncoder()
final_data_clean['gender'] = label_encoder.fit_transform(final_data_clean['gender'])
final_data_clean['process_step'] = label_encoder.fit_transform(final_data_clean['process_step'])

# Encoding the target variable 'Variation'
final_data_clean['Variation'] = label_encoder.fit_transform(final_data_clean['Variation'])

# Define the characteristics (X) and the target (y)
X = final_data_clean.drop(columns=['Variation', 'client_id', 'visitor_id', 'visit_id', 'date_time', 'date', 'time'])
y = final_data_clean['Variation']

# Split data in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LGBMClassifier
lgb_model = lgb.LGBMClassifier(objective='binary', is_unbalance=True, random_state=42)

# Define params
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'feature_fraction': [0.8, 0.9, 1.0]
}

# Config GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,  # Cross-validation con 5 particiones
    verbose=1,
    n_jobs=-1
)

# Train model
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Madre predictions
y_pred_lgb = best_model.predict(X_test)

# Calculate accuracy
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)

print(f"Best params: {grid_search.best_params_}")
print(f"Model accuracy after hyperparameter tuning: {accuracy_lgb}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_clean['gender'] = label_encoder.fit_transform(final_data_clean['gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_clean['process_step'] = label_encoder.fit_transform(final_data_clean['process_step'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_clean['Varia

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[LightGBM] [Info] Number of positive: 141094, number of negative: 112614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 754
[LightGBM] [Info] Number of data points in the train set: 253708, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.556128 -> initscore=0.225460
[LightGBM] [Info] Start training from score 0.225460
Best params: {'feature_fraction': 1.0, 'learning_rate': 0.1, 'n_estimators': 200, 'num_leaves': 100}
Model accuracy after hyperparameter tuning: 0.7859901934507386


In [3]:
# Get feature importance
importance = best_model.feature_importances_

# Get feature names (assuming X_train is a DataFrame)
feature_names = X_train.columns

# Create a DataFrame for better readability
import pandas as pd
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
})

# Sort by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance
print(feature_importance_df)

                Feature  Importance
5               balance        6033
2                   age        4787
1  seniority_per_months        3910
6         calls_6_month        1646
0   seniority_per_years        1425
3                gender         815
4    number_of_accounts         583
7        logons_6_month         544
8          process_step          57


In [4]:
df_final_experiment_clients_1

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control
...,...,...
70604,2443347,
70605,8788427,
70606,266828,
70607,1266421,


In [5]:
# Adjust again LabelEncoder 
final_data['gender'].fillna('Unknown', inplace=True)
final_data['process_step'].fillna('Unknown', inplace=True)

# Adjust LabelEnconder for'gender' y 'process_step'
label_encoder_gender = LabelEncoder()
label_encoder_gender.fit(final_data['gender'])

label_encoder_process_step = LabelEncoder()
label_encoder_process_step.fit(final_data['process_step'])

# Filter fills with NA
missing_variation_data = final_data[final_data['Variation'].isna()]

# Apply LabelEnconder
missing_variation_data['gender'] = label_encoder_gender.transform(missing_variation_data['gender'])
missing_variation_data['process_step'] = label_encoder_process_step.transform(missing_variation_data['process_step'])

# Define characteristics (X) for rows with NaN in 'Variation'
X_missing = missing_variation_data.drop(columns=['Variation', 'client_id', 'visitor_id', 'visit_id', 'date_time', 'date', 'time'])

# Predict missing values ​​using the best trained model
predicted_variation = best_model.predict(X_missing)

# Complete NaN with values
final_data.loc[final_data['Variation'].isna(), 'Variation'] = label_encoder.inverse_transform(predicted_variation)

# Print dataframe
final_data.head(10)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_data['gender'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_data['process_step'].fillna('Unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

Unnamed: 0,client_id,seniority_per_years,seniority_per_months,age,gender,number_of_accounts,balance,calls_6_month,logons_6_month,visitor_id,visit_id,process_step,date_time,date,time,Variation
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,228976764_46825473280_96584,confirm,2017-04-02 11:51:13,2017-04-02,11:51:13,Test
1,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,228976764_46825473280_96584,confirm,2017-04-02 11:47:50,2017-04-02,11:47:50,Test
2,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,228976764_46825473280_96584,confirm,2017-04-02 11:46:45,2017-04-02,11:46:45,Test
3,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,228976764_46825473280_96584,step_3,2017-04-02 11:23:08,2017-04-02,11:23:08,Test
4,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,228976764_46825473280_96584,step_2,2017-04-02 11:22:24,2017-04-02,11:22:24,Test
5,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,228976764_46825473280_96584,step_1,2017-04-02 11:21:38,2017-04-02,11:21:38,Test
6,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,228976764_46825473280_96584,start,2017-04-02 11:21:28,2017-04-02,11:21:28,Test
7,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,104438405_2368283624_817211,start,2017-03-29 11:02:44,2017-03-29,11:02:44,Test
8,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,104438405_2368283624_817211,start,2017-03-29 11:01:40,2017-03-29,11:01:40,Test
9,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,427070339_1413275162,104438405_2368283624_817211,start,2017-03-29 10:59:43,2017-03-29,10:59:43,Test


In [6]:
final_data['Variation'].value_counts()

gender
U    149867
M    149867
F    144036
X        14
Name: count, dtype: int64

In [7]:
# Export to .csv for have a clean dataframe

folder_path = r'C:\Users\VSPC\Documents\GitHub\New-Web-Study\data'

file_path = folder_path + "/df_final_clients_data.csv"

final_data.to_csv(file_path, index=False)

print(f"File saved to: {file_path}")

File saved to: C:\Users\VSPC\Documents\GitHub\New-Web-Study\data/df_final_clients_data.csv
