In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pickle

In [3]:
def preprocess_data(X_train, X_test):
    """Standardize both splits using a scaler fitted on the training split.

    A ``StandardScaler`` is fitted on ``X_train`` only; that same fitted
    scaler is then used to transform ``X_train`` and ``X_test``.

    Args:
        X_train: Training feature matrix used to fit the scaler.
        X_test: Test feature matrix, transformed with the training fit.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)``.
    """
    standardizer = StandardScaler().fit(X_train)
    return standardizer.transform(X_train), standardizer.transform(X_test)


In [4]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Delegates to ``train_test_split`` with the given ``test_size`` and
    ``random_state``.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` (default ``0.2``).
        random_state: Forwarded to ``train_test_split`` (default ``42``).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    train_features, test_features, train_target, test_target = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return train_features, test_features, train_target, test_target

In [5]:
def select_k_best(X_train, X_test, y_train, k=5):
    """Keep the ``k`` highest-scoring features according to ``f_regression``.

    The selector is fitted on the training split (features and target) and
    the resulting column subset is applied to both splits.

    Args:
        X_train: Training feature matrix used to score the features.
        X_test: Test feature matrix, reduced to the same columns.
        y_train: Training target used by the scoring function.
        k: Number of features to keep (default ``5``).

    Returns:
        tuple: ``(X_train_selected, X_test_selected)``.
    """
    feature_filter = SelectKBest(score_func=f_regression, k=k)
    feature_filter.fit(X_train, y_train)
    return feature_filter.transform(X_train), feature_filter.transform(X_test)

In [6]:
def train_models(X_train, y_train, random_state=42):
    """Grid-search four regressors and return the best estimator of each.

    Every candidate is tuned with ``GridSearchCV`` (5-fold CV, scored by
    ``r2``) on the supplied training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        random_state: Seed passed to the Decision Tree and Random Forest
            estimators so repeated runs give the same fitted models.
            Defaults to ``42``, matching ``split_data``.

    Returns:
        dict: Maps each model's display name to the ``best_estimator_``
        found by its grid search.
    """
    # Each entry pairs an estimator with its hyper-parameter grid. The table
    # is built once, outside the loop.
    #
    # Tree criteria use the current names ('squared_error',
    # 'absolute_error'); the older 'mse'/'mae' aliases are no longer accepted
    # by recent scikit-learn releases. Likewise max_features='auto' is gone;
    # None selects all features, which is what 'auto' meant for regressors.
    candidates = {
        # An empty grid makes GridSearchCV evaluate the default parameters.
        'Linear Regression': (LinearRegression(), {}),
        'SVR': (
            SVR(),
            {
                'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
                'C': [10, 100],
                'gamma': ["auto", "scale"],
            },
        ),
        'Decision Tree': (
            DecisionTreeRegressor(random_state=random_state),
            {
                'criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
                'max_features': [None, 'sqrt', 'log2'],
                'splitter': ["best", "random"],
            },
        ),
        'Random Forest': (
            RandomForestRegressor(random_state=random_state),
            {
                'criterion': ["squared_error", "absolute_error", 'friedman_mse', 'poisson'],
                'max_features': [None, 'sqrt', 'log2'],
                'n_estimators': [10, 100],
            },
        ),
    }

    best_models = {}
    for name, (model, grid) in candidates.items():
        grid_search = GridSearchCV(model, grid, cv=5, scoring='r2')
        grid_search.fit(X_train, y_train)
        best_models[name] = grid_search.best_estimator_

    return best_models

In [7]:
def evaluate_models(models, X_test, y_test):
    """Score every fitted model on the test split with R^2.

    Args:
        models: Mapping of display name to a fitted estimator exposing
            ``predict``.
        X_test: Test feature matrix passed to each model's ``predict``.
        y_test: True target values compared against the predictions.

    Returns:
        pandas.DataFrame: One row per model, with columns ``'Model'`` and
        ``'R2 Score'``, in the iteration order of ``models``.
    """
    labels = list(models)
    scores = [r2_score(y_test, fitted.predict(X_test)) for fitted in models.values()]
    return pd.DataFrame({'Model': labels, 'R2 Score': scores})

In [8]:
def save_model(model, filename):
    """Pickle ``model`` to the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        model: Object to serialize with ``pickle``.
        filename: Destination path.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(model)


In [9]:
# Load the dataset and one-hot encode its non-numeric columns.
data = pd.read_csv("pre-E_comm.csv", index_col=None)
df2 = data

# drop_first=True drops one dummy level per encoded column.
df2 = pd.get_dummies(df2, drop_first=True)

# Separate the features from the 'Churn' target. The axis is named through
# `columns=` because passing it positionally (`drop('Churn', 1)`) is
# rejected by pandas 2.x.
X = df2.drop(columns='Churn')
y = df2['Churn']
# NOTE(review): 'Churn' looks like a binary label, yet every model below is a
# regressor scored with R^2 - confirm that regression is intended.

X_train, X_test, y_train, y_test = split_data(X, y)

# The scaler is fitted on the training split only, then applied to both.
X_train_scaled, X_test_scaled = preprocess_data(X_train, X_test)

# Keep the 5 features ranked highest by f_regression on the training split.
X_train_selected, X_test_selected = select_k_best(X_train_scaled, X_test_scaled, y_train, k=5)

best_models = train_models(X_train_selected, y_train)

results_df = evaluate_models(best_models, X_test_selected, y_test)
print(results_df)






               Model  R2 Score
0  Linear Regression  0.198120
1                SVR  0.262687
2      Decision Tree  0.619645
3      Random Forest  0.598817


In [10]:
# Fit the model that gets persisted. random_state is fixed so the saved
# artifact can be reproduced on a re-run.
# NOTE(review): this forest is configured by hand and fitted on the raw
# X_train (not the scaled / k-best features); the tuned estimators returned
# by train_models() are not used here - confirm this is intended.
best_model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)
best_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=20)

In [11]:
# Persist the fitted forest as a pickle file in the working directory.
save_model(model=best_model, filename='regression_model.pkl')

In [13]:
# Predict for every row of X. X is the full feature table, so it includes the
# rows best_model was fitted on; these values are not a held-out estimate.
best_model.predict(X)

array([0.94, 0.99, 0.97, ..., 0.11, 0.07, 0.03])