In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import pickle

In [3]:
# Load the merged app-review export into a DataFrame. The path is relative to
# the notebook's working directory.
reviews_csv_path = '../../Warehouse/Reviews/app_reviews_merged.csv'
df = pd.read_csv(reviews_csv_path)

In [4]:
# --- Cleaning -------------------------------------------------------------
# Discard reviews missing any field used below (target or either feature),
# then renumber the surviving rows from zero.
df = df.dropna(subset=['score', 'thumbsUpCount', 'reviewCreatedVersion']).reset_index(drop=True)

# --- Categorical feature --------------------------------------------------
# Replace each distinct 'reviewCreatedVersion' value with an integer label.
# The fitted encoder stays in `le` so the mapping can be reused or inverted.
le = LabelEncoder()
version_labels = le.fit_transform(df['reviewCreatedVersion'])
df['reviewCreatedVersion_encoded'] = version_labels

# --- Numerical feature ----------------------------------------------------
# Rescale 'thumbsUpCount' with a MinMaxScaler (default settings). The scaler
# expects a 2-D input, hence the double brackets on both sides.
scaler = MinMaxScaler()
thumbs_up_scaled = scaler.fit_transform(df[['thumbsUpCount']])
df[['thumbsUpCount_normalized']] = thumbs_up_scaled

# --- Model inputs ---------------------------------------------------------
# X holds the two engineered features; y is the review score to predict.
y = df['score']
X = df[['thumbsUpCount_normalized', 'reviewCreatedVersion_encoded']]

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion. fit() returns the estimator
# itself, so construction and fitting can be chained.
regression_model = LinearRegression().fit(X_train, y_train)

# Predict scores for the held-out rows.
y_pred = regression_model.predict(X_test)

# Compare predictions against the true held-out scores with three metrics:
# R-squared, mean squared error and mean absolute error (in that order).
r2, mse, mae = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

In [8]:
# Persist the fitted regression model to 'model.pkl' in the working directory.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises, instead of leaving an open handle for the garbage collector.
# NOTE(review): only the regressor is saved here; the fitted `le` and `scaler`
# used to build its input features are not persisted by this cell, so anything
# loading model.pkl must reproduce that preprocessing itself.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(regression_model, model_file)

In [6]:
# Report the held-out evaluation metrics, one per line.
for label, value in (
    ("R-squared: ", r2),
    ("Mean Squared Error: ", mse),
    ("Mean Absolute Error: ", mae),
):
    print(label, value)

# Pair each feature column with its fitted coefficient so the weights can be
# read by name rather than by position.
coef_dict = dict(zip(X.columns, regression_model.coef_))
print("Coefficients: ", coef_dict)
print("Intercept: ", regression_model.intercept_)

R-squared:  0.0047422929139170344
Mean Squared Error:  2.7091208058050067
Mean Absolute Error:  1.4321246906046519
Coefficients:  {'thumbsUpCount_normalized': -4.751381138861692, 'reviewCreatedVersion_encoded': -0.003954022270358937}
Intercept:  4.3059838488685385
