In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgbm
from lightgbm import LGBMRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Competition CSVs: the labelled training set and the unlabelled test set.
TRAIN_CSV = "/kaggle/input/playground-series-s3e25/train.csv"
TEST_CSV = "/kaggle/input/playground-series-s3e25/test.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [None]:
# Peek at the first five training rows.
train_df.head(n=5)

In [None]:
# Summary statistics of the training frame's numeric columns.
train_df.describe()

In [None]:
# Distribution of the Hardness target: 100 bins, no KDE overlay.
hardness_ax = sns.histplot(train_df, x='Hardness', bins=100, kde=False)
hardness_ax.set_title('Histogram of Column')

In [None]:
# Distinct values taken by the Hardness target.
train_df.loc[:, "Hardness"].unique()

In [None]:
# Pairwise correlation matrix of every column except `id`;
# then show how each column correlates with the Hardness target.
correlations = train_df.drop('id', axis=1).corr()
print(correlations.loc[:, "Hardness"])

In [None]:
%matplotlib inline

plt.figure(figsize=(12,8))
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# Scatter matrix of all columns except `id`, with KDE curves on the diagonal.
sns.pairplot(train_df.drop('id', axis=1), diag_kind='kde')

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column except `id`. Note that the Hardness target column
# is still present in this frame, so it is standardised too.
sc = StandardScaler()
scaled_train_df = train_df.drop(columns=['id'])
scaled_values = sc.fit_transform(scaled_train_df)
scaled_train_df[list(scaled_train_df.columns)] = scaled_values
scaled_train_df.head()

In [None]:
# Disabled experiment: univariate feature selection with SelectKBest.
# NOTE(review): X and y are not assigned until a later cell, so this snippet
# cannot run at this position; move it below the X/y definitions before re-enabling.
#from sklearn.feature_selection import SelectKBest, r_regression
#X_new = SelectKBest(r_regression, k=5).fit_transform(X, y)
#pd.DataFrame(X_new).head()

In [None]:
# Box plot of each standardised column with the individual points overlaid
# (no jitter, so points sit on the box's centre line).
fig, ax = plt.subplots(figsize=(14, 14))
sns.boxplot(data=scaled_train_df, ax=ax)
sns.stripplot(data=scaled_train_df, color='blue', size=1, jitter=False, ax=ax)
plt.xticks(rotation=45)

In [None]:
from sklearn.ensemble import RandomForestRegressor  
from sklearn.metrics import median_absolute_error, r2_score

# Features come from the standardised frame; the target is read from the
# original (unscaled) training frame.
X = scaled_train_df.drop(columns=['Hardness'])
y = train_df.loc[:, 'Hardness']

# Submission features: the test frame without its `id` column (not yet scaled).
X_sub = test_df.drop(columns=['id'])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: LightGBM regressor with the "mae" objective, fitted on the training split.
model = LGBMRegressor(objective="mae", random_state=42, verbose=-1).fit(X_train, y_train)

# Score on the held-out split. `mae` holds sklearn's *median* absolute error.
prediction = model.predict(X_test)
mae, r2 = median_absolute_error(y_test, prediction), r2_score(y_test, prediction)
print(mae)
print(r2)

Now let's try removing outliers and see whether the score improves.

In [None]:
# IQR-based outlier removal, applied one feature column at a time.
# For each feature, rows outside [Q1 - threshold*IQR, Q3 + threshold*IQR] are
# dropped; the quartiles are recomputed on the rows that survived the
# previous columns. The Hardness target is never used as a filter.
threshold = 1.5

# Work on an explicit copy so `scaled_train_df` is clearly left untouched.
no_outliers_df = scaled_train_df.copy()

for column in scaled_train_df.columns.drop("Hardness"):
    Q1 = no_outliers_df[column].quantile(0.25)
    Q3 = no_outliers_df[column].quantile(0.75)
    IQR = Q3 - Q1

    # Build the mask from `no_outliers_df` only. Mixing in `scaled_train_df`
    # (which still has the already-dropped rows) yields a boolean key whose
    # index does not match the frame being filtered.
    is_outlier = (
        (no_outliers_df[column] < Q1 - threshold * IQR)
        | (no_outliers_df[column] > Q3 + threshold * IQR)
    )
    outliers = no_outliers_df[is_outlier]
    no_outliers_df = no_outliers_df.drop(outliers.index)

In [None]:
# Recompute the correlation matrix on the outlier-free frame and show the
# Hardness column (this overwrites the earlier `correlations`).
correlations = no_outliers_df.corr()
print(correlations.loc[:, "Hardness"])

In [None]:
# Candidate feature subset (only referenced by a commented-out line further down).
features = [
    "density_Total",
    "allelectrons_Average",
    "val_e_Average",
    "el_neg_chi_Average",
    "R_cov_element_Average",
    "ionenergy_Average",
    "zaratio_Average",
]

In [None]:
# Box plot of the feature columns after outlier removal, with jittered points overlaid.
features_only = no_outliers_df.drop(columns="Hardness")
fig, ax = plt.subplots(figsize=(14, 14))
sns.boxplot(data=features_only, ax=ax)
sns.stripplot(data=features_only, color='blue', size=1, jitter=True, ax=ax)
plt.xticks(rotation=45)

In [None]:
# Rebuild the design matrix from the outlier-free frame. The original cell
# referenced an undefined `df`, which raises NameError on a fresh kernel.
X = no_outliers_df.drop(columns="Hardness")
#X = X[features]  # optional: restrict to the hand-picked feature subset

# Take the unscaled target for exactly the rows that survived outlier removal.
# `no_outliers_df` keeps the original row labels of `train_df`, so aligning on
# X.index keeps X and y the same length (train_test_split requires this).
y = train_df.loc[X.index, "Hardness"]

print(X.shape[0])
print(y.shape[0])
assert len(X) == len(y), "features and target must have the same number of rows"

# Same 80/20 split and seed as the baseline experiment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Second model: same LightGBM configuration, trained on the outlier-free split.
model2 = LGBMRegressor(objective="mae", random_state=42, verbose=-1)
model2.fit(X_train, y_train)

# Evaluate the model that was just trained. The original cell called
# `model.predict`, which scored the baseline model instead of `model2`.
prediction = model2.predict(X_test)
mae = median_absolute_error(y_test, prediction)  # median absolute error
print(mae)
r2 = r2_score(y_test, prediction)
print(r2)

Make submission predictions

In [None]:
# Scale the submission features with statistics learned from the TRAINING data.
# Calling fit_transform on the test set would re-fit the scaler and standardise
# the test features with different means/variances than the ones the model saw.
# StandardScaler works column by column, so fitting on the training feature
# columns reproduces the scaling that was applied to those columns for training.
feature_scaler = StandardScaler().fit(train_df[X_sub.columns])
X_sub_scaled = pd.DataFrame(
    feature_scaler.transform(X_sub),
    columns=X_sub.columns,
    index=X_sub.index,
)
sub_pred = model.predict(X_sub_scaled)

# Submission frame: test ids alongside the predicted Hardness.
sub_df = test_df[['id']].copy()
sub_df['Hardness'] = sub_pred
sub_df.head(10)

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv(path_or_buf='submission.csv', index=False)