### TRAIN / TEST SPLIT

A train/test split is a fundamental step in machine learning for evaluating model performance. The dataset is divided into a training set, used to fit the model, and a test set, used to measure its accuracy on data it has not seen. Comparing performance on the two sets shows how well the model generalizes and helps detect overfitting.

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import LinearRegression
import os
from sys import platform

In [40]:
# Directory holding the competition CSVs. Set the DIAMONDS_DATA_DIR environment
# variable to read them from somewhere else; when it is unset, the original
# D:\ drive root is used, so existing setups keep working unchanged.
data_dir = os.environ.get("DIAMONDS_DATA_DIR", "D:\\")
df_train = pd.read_csv(os.path.join(data_dir, "train.csv"))

In [42]:
# Directory holding the competition CSVs. Set the DIAMONDS_DATA_DIR environment
# variable to read them from somewhere else; when it is unset, the original
# D:\ drive root is used, so existing setups keep working unchanged.
data_dir = os.environ.get("DIAMONDS_DATA_DIR", "D:\\")
df_test = pd.read_csv(os.path.join(data_dir, "test.csv"))

In [44]:
# Peek at one randomly chosen training row to see the columns and value formats.
df_train.sample(n=1)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
6653,6653,1.2,Ideal,G,SI1,62.2,58.0,6.77,6.84,4.23,8.816


In [46]:
# Peek at one randomly chosen test row; unlike the training data it has no `price`.
df_test.sample(n=1)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
8269,8269,0.7,Ideal,D,VS2,62.7,57.0,5.65,5.67,3.55


In [48]:
# Missing-value count for each column of the test data.
df_test.isna().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
dtype: int64

In [50]:
# Missing-value count for each column of the training data.
df_train.isna().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [52]:
# (rows, columns) of the training data, which includes the `price` target.
df_train.shape

(40455, 11)

In [54]:
# (rows, columns) of the test data, which has no `price` column.
df_test.shape

(13485, 10)

### TRAIN DATASET

##### WE CAN DROP CATEGORICAL COLUMNS

In [56]:
# Keep only the numeric columns; the text-valued ones (cut, color, clarity)
# cannot be passed to LinearRegression as-is.
# Selecting with include="number" instead of exclude="object" does not rely on
# text columns being stored with the `object` dtype, so the result stays purely
# numeric even when pandas infers a dedicated string dtype for them.
df_train_cleaned = df_train.select_dtypes(include="number")
df_train_cleaned

Unnamed: 0,id,carat,depth,table,x,y,z,price
0,0,0.30,60.2,56.0,4.36,4.41,2.64,6.163
1,1,0.51,61.5,58.0,5.14,5.11,3.15,7.373
2,2,1.59,62.1,58.0,7.45,7.43,4.62,9.305
3,3,0.70,62.6,57.0,5.63,5.68,3.54,7.990
4,4,1.01,63.1,59.0,6.31,6.34,3.99,9.265
...,...,...,...,...,...,...,...,...
40450,40450,0.50,61.9,58.0,5.12,5.09,3.16,6.914
40451,40451,1.07,62.2,57.0,6.59,6.56,4.09,8.576
40452,40452,1.42,62.7,55.0,7.11,7.17,4.48,9.346
40453,40453,0.42,61.3,56.0,4.81,4.82,2.95,6.938


##### SPLIT INTO TRAIN AND HOLD-OUT SETS

In [64]:
# Features are every remaining numeric column except the target. Selecting the
# target by name (rather than "all but the last column") keeps this correct
# even if the column order of the input changes.
# NOTE(review): `id` is kept as a feature; it looks like a row counter rather
# than a diamond attribute — confirm whether it should be excluded (the
# test-set prediction cell would need the same columns).
X = df_train_cleaned.drop(columns="price")
y = df_train_cleaned["price"]

# Hold out 35% of the labelled rows for evaluation. A fixed random_state makes
# the split, and therefore the reported error, reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42
)

##### FIT

In [68]:
# Linear regression model fitted on the training portion of the split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

##### PREDICT

In [70]:
# Predictions for the held-out rows, to be compared against y_test.
y_pred = regressor.predict(X=X_test)

In [72]:
# Root-mean-squared error on the held-out rows, in the same units as `price`.
# scikit-learn documents the argument order as (y_true, y_pred). MSE is
# symmetric, so the value is unchanged, but following the documented order
# avoids a silent error if this is later swapped for an asymmetric metric.
np.sqrt(mean_squared_error(y_test, y_pred))

0.27814264773491304

### TEST DATASET

In [75]:
# Another one-row random peek at the raw test data before cleaning it.
df_test.sample(n=1)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
12530,12530,0.5,Very Good,F,SI2,62.9,57.0,5.02,5.06,3.17


##### CLEAN

In [77]:
# Keep only the numeric columns of the test data, mirroring the cleaning
# applied to the training data.
# Selecting with include="number" instead of exclude="object" does not rely on
# text columns being stored with the `object` dtype, so the result stays purely
# numeric even when pandas infers a dedicated string dtype for them.
df_test_cleaned = df_test.select_dtypes(include="number")
df_test_cleaned.sample()

Unnamed: 0,id,carat,depth,table,x,y,z
4977,4977,0.3,60.4,61.0,4.31,4.27,2.59


##### PREDICT

In [82]:
# Predict prices for the unlabelled test rows.
# Drop any `price` column that may already be attached to df_test_cleaned
# (e.g. when cells are re-run out of order) so the model only receives the
# columns it was fitted on; errors="ignore" makes this a no-op otherwise.
# NOTE: this rebinds `y_pred`, which previously held the hold-out predictions.
test_features = df_test_cleaned.drop(columns="price", errors="ignore")
y_pred = regressor.predict(test_features)
y_pred

array([7.26321538, 6.78873118, 7.83241893, ..., 6.56374171, 9.88746867,
       7.3402639 ])

### Submission DataFrame with 2 columns (id, price)

In [85]:
# Row count of the cleaned test frame, plus one random row as a spot check.
print(len(df_test_cleaned))
df_test_cleaned.sample(n=1)

13485


Unnamed: 0,id,carat,depth,table,x,y,z
10028,10028,0.7,61.6,57.0,5.7,5.73,3.53


In [87]:
# Build the two-column submission frame (id + predicted price) as a new object
# instead of writing `price` into df_test_cleaned. Mutating the feature frame
# would make the prediction cell fail on re-run, because the model would then
# be handed one more column than it was trained on.
df_for_submission = df_test_cleaned[["id"]].assign(price=y_pred)

In [89]:
# Row count of the submission frame, plus one random row as a spot check.
print(len(df_for_submission))
df_for_submission.sample(n=1)

13485


Unnamed: 0,id,price
4494,4494,8.581103


### EXPORT

In [None]:
# Write the submission file to the working directory, omitting the index column.
df_for_submission.to_csv(path_or_buf="my_submission.csv", index=False)