# In [None]:
# import library
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the training logs and the training scores (in that order) from the
# Kaggle input directory.
train_logs, train_scores = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv',
        '/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv',
    )
)
#print('data Logs: ')
#print(train_logs.head())
#print('\nData Scores: ')
#print(train_scores.head())


# analisis distribusi skor pada data latih
#plt.figure(figsize=(8, 5))
#sns.countplot(x='score', data=train_scores)
#plt.title('Distribusi skor pada data latih')
#plt.show()

# cek data yang hilang pada data logs
#missing_data = train_logs.isnull().sum()
#print("Data yang Hilang pad Data Logs: ")
#print(missing_data)

# Analisis Outliers pada data skor
#plt.figure(figsize=(8, 5))
#sns.boxplot(x='score', data=train_scores)
#plt.title('Boxplot Skor Pada data Latih')
#plt.show()


# Add the total_changes column: 1 when the row's text_change differs from
# 'NoChange', 0 otherwise. The comparison is vectorised instead of being
# applied row by row through a Python lambda; missing values still compare
# as "different" (-> 1), exactly as `x != 'NoChange'` did.
train_logs['total_changes'] = train_logs['text_change'].ne('NoChange').astype('int64')
print(train_logs.tail())

# Attach the score to every log row by joining the two frames on 'id'.
train_data = pd.merge(train_logs, train_scores, on='id')

# Split into training and validation sets by 'id' rather than by row.
# Rows that share an id receive their score from the same train_scores entry,
# so a row-level split places the same id on both sides and the validation
# error no longer measures how the model behaves on unseen ids.
train_ids, valid_ids = train_test_split(
    train_data['id'].unique(), test_size=0.2, random_state=42
)
train = train_data[train_data['id'].isin(train_ids)]
valid = train_data[train_data['id'].isin(valid_ids)]
#print("Data Pelatihan: ")
#print(train.head())
#print("\nData Validasi: ")
#print(valid.head())


# Columns fed to the models as input features.
features = [
    'down_time',
    'up_time',
    'action_time',
    'cursor_position',
    'word_count',
    'total_changes',
]

# Feature matrix and target vector for each split.
X_train, y_train = train[features], train['score']
X_valid, y_valid = valid[features], valid['score']

# Baseline: linear regression fitted on the unscaled training features
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Predict on the validation split and report the mean squared error.
y_pred = model.predict(X_valid)
mse = mean_squared_error(y_true=y_valid, y_pred=y_pred)
print(f'Mean Squared Error : {mse}')


# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the validation features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Linear regression again, this time trained on the scaled features.
model_scaled = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the scaled validation features and report the mean squared error.
y_pred_scaled = model_scaled.predict(X_valid_scaled)
mse_scaled = mean_squared_error(y_true=y_valid, y_pred=y_pred_scaled)
print(f'Mean Squared Error setelah skala fitur numerik : {mse_scaled}')


# Lasso regression on the scaled features (tried because the previous
# results were unsatisfactory).
model_lasso = Lasso(alpha=0.1)
model_lasso.fit(X_train_scaled, y_train)

# Ridge regression on the same scaled features.
model_ridge = Ridge(alpha=0.1)
model_ridge.fit(X_train_scaled, y_train)

# Predict on the validation split with each model. The Ridge predictions
# must come from model_ridge; predicting with model_lasso here made the two
# reported errors identical.
y_pred_lasso = model_lasso.predict(X_valid_scaled)
y_pred_ridge = model_ridge.predict(X_valid_scaled)


# Evaluate both models with the mean squared error.
mse_lasso = mean_squared_error(y_valid, y_pred_lasso)
mse_ridge = mean_squared_error(y_valid, y_pred_ridge)
print(f'Mean Squared Error Regresi Lasso: {mse_lasso}')
print(f'Mean Squared Error Regresi Ridge: {mse_ridge}')

# Alpha values to try during the search.
param_ridge = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Fresh Ridge estimator handed to the grid search.
model_ridge = Ridge()

# 5-fold grid search over alpha, scored by negative mean squared error.
grid_search = GridSearchCV(
    estimator=model_ridge,
    param_grid=param_ridge,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)

# Keep the estimator the search selected as best.
best_model_ridge = grid_search.best_estimator_

# Predict on the validation split with the selected model and report its
# error together with the chosen alpha.
y_pred_ridge = best_model_ridge.predict(X_valid_scaled)
mse_ridge = mean_squared_error(y_true=y_valid, y_pred=y_pred_ridge)
print(f'Mean Squared Error Regresi Ridge setelah fine-tuning: {mse_ridge}')
print(f'Best alpha untuk Regresi Ridge: {best_model_ridge.alpha}')



# Scatter plot of predictions against the actual targets on the validation
# split. No random-forest model is trained anywhere in this script, so the
# previously referenced y_pred_rf does not exist; the tuned Ridge predictions
# are plotted instead and the title names the model actually shown.
plt.scatter(y_valid, y_pred_ridge)
plt.title('Scatter Plot: Prediksi vs Target Aktual (Ridge)')
plt.xlabel('Target Aktual')
plt.ylabel('Prediksi Model')
plt.show()


# Load the test logs and rebuild the same feature columns used for training.
# NOTE(review): assumes test_logs.csv sits next to train_logs.csv in the
# competition input directory and has the same columns — confirm.
test_logs = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
test_logs['total_changes'] = test_logs['text_change'].ne('NoChange').astype('int64')
X_test = test_logs[features]

# Predict on the test set with the scaler fitted on the training data and the
# tuned Ridge model (no random-forest model exists in this script, so the
# previously referenced best_model_rf was undefined).
X_test_scaled = scaler.transform(X_test)
y_pred_test = best_model_ridge.predict(X_test_scaled)

# Build the submission frame. Predictions are produced per log row, so they
# are averaged to a single score per id.
# NOTE(review): presumably the submission expects one row per id — confirm
# against the competition's sample submission.
submission_df = (
    pd.DataFrame({'id': test_logs['id'], 'score': y_pred_test})
    .groupby('id', as_index=False)['score']
    .mean()
)

# Save the submission to CSV.
submission_df.to_csv('submission.csv', index=False)
