In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the dataset from a CSV file located relative to the working directory.
data = pd.read_csv(filepath_or_buffer="cleaned_data_no_outliers.csv")

In [3]:
# Regression target: total time per day.
y = data["Total Time (hrs/day)"]

# Features: every remaining column except the target and "Apps Installed Category".
# NOTE(review): the prediction example later in this notebook feeds several
# per-activity "... Time (hrs/day)" columns; if the target is derived from
# those columns, the reported R^2 mostly reflects that relationship (possible
# target leakage) -- TODO confirm how "Total Time (hrs/day)" is computed.
X = data.drop(["Total Time (hrs/day)", "Apps Installed Category"], axis="columns")

In [4]:
# Split the feature columns by dtype: object-dtype columns are treated as
# categorical, everything else as numerical.
numerical_cols = X.select_dtypes(exclude=["object"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

# Numerical features are standardized.
numerical_pipeline = Pipeline([("scaler", StandardScaler())])

# Categorical features are one-hot encoded; handle_unknown="ignore" keeps
# prediction from failing on categories that were not present during fit.
categorical_pipeline = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

In [5]:
# Route each column group through its own pipeline; the outputs are
# concatenated into a single feature matrix for the regressor.
preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols),
])

In [6]:
# Base learners for the ensemble.
# The two tree ensembles are stochastic, so they are seeded to make the
# metrics and the saved model reproducible under Restart & Run All. The seed
# matches the one already passed to train_test_split in this notebook.
ridge = Ridge()
rf = RandomForestRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

# VotingRegressor fits every base learner and averages their predictions.
voting_regressor = VotingRegressor(estimators=[
    ("ridge", ridge),
    ("rf", rf),
    ("gbr", gbr)
])

In [7]:
# End-to-end estimator: preprocessing followed by the voting ensemble, so the
# same transformations are applied at fit time and at predict time.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", voting_regressor),
])

In [8]:
# Hold out 20% of the rows for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained).
y_pred = model.fit(X_train, y_train).predict(X_test)

# Evaluate on the held-out split.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.08664702814969508
R^2 Score: 0.9957892919317962


In [9]:
# Single hand-written record used as a smoke test of the fitted pipeline.
# It is built from one row-dict, which yields a one-row frame with these
# columns in this order.
Predict = pd.DataFrame([{
    "Age": 29,
    "Gender": "Male",
    "Location": "Chennai",
    "Phone Brand": "Samsung",
    "OS": "Android",
    "Screen Time (hrs/day)": 6.8,
    "Data Usage (GB/month)": 15.2,
    "Number of Apps Installed": 112,
    "Social Media Time (hrs/day)": 3.5,
    "E-commerce Spend (INR/month)": 1200,
    "Streaming Time (hrs/day)": 4.2,
    "Gaming Time (hrs/day)": 2.1,
    "Monthly Recharge Cost (INR)": 850,
    "Primary Use": "Entertainment",
    "Calls Duration (hrs/day)": 2.1,
}])

# predict() returns one value per input row; only the first (and only) one is shown.
predicted_time = model.predict(Predict)
print(f"Predicted Total Time (hrs/day): {predicted_time[0]}")

Predicted Total Time (hrs/day): 18.870138466266756


In [10]:
# Persist the fitted pipeline (preprocessing + ensemble) to disk.
# joblib is imported in the notebook's import cell at the top.
# NOTE: the artifact is pickle-based -- only load it from trusted sources,
# ideally with the same scikit-learn version that produced it.
filename = 'model.pkl'
joblib.dump(model, filename)

['model.pkl']