In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import LinearRegression
import os
from sys import platform

 Table

# Instructions

1. Load the `train.csv` file
2. Explore the data, understand it
3. Process it for future training
4. Do train, test, split for your `train.csv` file
5. `fit/train` a model from your cleaned_train_df
-----
5. Load the `test.csv` file
6. Apply the same processing you did to `train.csv` into `test.csv`
7. `predict` the price for that file
8. Only keep the columns you need
9. Export
-----
10. Repeat! 🚀🔥

# Import the csv files

In [51]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [52]:
df_train.sample()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
29142,29142,1.2,Ideal,G,SI1,61.8,59.0,6.77,6.8,4.19,8.819


In [53]:
df_test.sample()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
11891,11891,1.01,Very Good,D,SI2,62.2,60.0,6.4,6.43,3.99


In [54]:
df_train.shape

(40455, 11)

In [55]:
df_test.shape

(13485, 10)

In [56]:
#SUBMISSION -> test

In [57]:
print(df_train.shape[0])
df_train.sample()

40455


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
9418,9418,0.33,Premium,G,IF,59.4,59.0,4.49,4.54,2.68,6.749


In [58]:
print(df_test.shape[0])
df_test.sample()

13485


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
9315,9315,0.71,Premium,F,SI1,62.2,58.0,5.74,5.67,3.55


# Cleaning, processing, feature selection, etc

In [59]:
# Processing is necessary, otherwise we won't be able to fit a model
# For the sake of the example, we'll just drop categorical columns

df_train_cleaned = df_train.select_dtypes(exclude='object')
print(df_train_cleaned.shape[0])
df_train_cleaned

40455


Unnamed: 0,id,carat,depth,table,x,y,z,price
0,0,1.02,63.2,58.0,6.36,6.40,4.03,8.928
1,1,0.35,61.0,57.0,4.54,4.57,2.77,6.477
2,2,0.31,60.5,58.0,4.43,4.40,2.67,6.810
3,3,0.38,61.4,56.0,4.66,4.69,2.87,6.824
4,4,1.64,61.8,56.0,7.59,7.60,4.69,9.776
...,...,...,...,...,...,...,...,...
40450,40450,1.20,62.2,55.0,6.77,6.81,4.23,9.149
40451,40451,1.50,64.2,56.0,7.30,7.09,4.62,9.077
40452,40452,1.06,61.9,55.0,6.54,6.58,4.06,8.892
40453,40453,0.31,60.1,58.0,4.40,4.38,2.64,6.385


# Train on train.csv

![](https://builtin.com/sites/www.builtin.com/files/styles/ckeditor_optimize/public/inline-images/4_train-test-split.jpg)

## Train, test split

In [60]:
X = df_train_cleaned.iloc[:,:-1]
y = df_train_cleaned['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

## Fit

In [61]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Just for feedback
if platform == "darwin":
    os.system("say I'm done training")

In [62]:
y_pred = regressor.predict(X_test)

In [63]:
np.sqrt(mean_squared_error(y_pred, y_test))

0.3706202430646625

-----
-----
-----
-----


# Applying same cleaning & processing to my `test.csv`

In [64]:
df_test_cleaned.sample()

Unnamed: 0,id,carat,depth,table,x,y,z,price
1000,1000,0.5,61.2,60.0,5.12,5.08,3.12,7.215815


In [65]:
df_test_cleaned = df_test.select_dtypes(exclude='object')
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
8748,8748,0.3,63.0,55.0,4.29,4.28,2.7


# Predict on the `test.csv`

In [66]:
y_pred = regressor.predict(df_test_cleaned)
y_pred

# Just for feedback
if platform == "darwin":
    os.system("say -v ayam don predictin")

In [67]:
df_test_cleaned.sample

<bound method NDFrame.sample of           id  carat  depth  table     x     y     z
0          0   0.81   61.5   57.0  6.01  6.06  3.71
1          1   0.50   63.8   58.0  5.08  4.97  3.21
2          2   0.31   60.1   56.0  4.43  4.46  2.67
3          3   1.52   64.7   58.0  7.19  7.22  4.66
4          4   0.35   60.8   58.0  4.55  4.53  2.76
...      ...    ...    ...    ...   ...   ...   ...
13480  13480   0.52   61.4   56.0  5.23  5.20  3.20
13481  13481   1.01   59.3   59.0  6.50  6.56  3.87
13482  13482   1.50   60.6   61.0  7.34  7.31  4.44
13483  13483   0.40   62.5   54.0  4.75  4.76  2.97
13484  13484   2.01   61.6   61.0  8.14  8.07  4.99

[13485 rows x 7 columns]>

# DF with two columns

In [68]:
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
8456,8456,0.59,61.1,57.0,5.43,5.4,3.31


In [69]:
df_test_cleaned['price'] = y_pred # Adding the predicted price
df_for_submission = df_test_cleaned[["id", "price"]] # Modifying for subnmission

In [70]:
print(df_for_submission.shape[0])
df_for_submission.sample()

13485


Unnamed: 0,id,price
2045,2045,8.756878


# Export (index=False)

In [71]:
df_for_submission.to_csv("my_submission.csv", index=False)

# Just for feedback
if platform == "darwin":
    os.system("say redi for submission")

In [72]:
df_for_submission.head()

Unnamed: 0,id,price
0,0,8.184137
1,1,7.195899
2,2,6.560437
3,3,9.081303
4,4,6.637876
