In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import os
from sys import platform

 Table

# Instructions

1. Load the `train.csv` file
2. Explore the data, understand it
3. Process it for future training
4. Do train, test, split for your `train.csv` file
5. `fit/train` a model from your cleaned_train_df
-----
5. Load the `test.csv` file
6. Apply the same processing you did to `train.csv` into `test.csv`
7. `predict` the price for that file
8. Only keep the columns you need
9. Export
-----
10. Repeat! 🚀🔥

# Import the csv files

In [70]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [71]:
df_train.sample()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
38166,38166,0.52,Very Good,D,VS1,59.6,57.0,5.24,5.3,3.14,1995


In [72]:
df_test.sample()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
11359,11359,0.31,Premium,H,VS1,62.9,58.0,4.34,4.31,2.72


In [73]:
df_train.shape

(40455, 11)

In [74]:
df_test.shape

(13485, 10)

In [75]:
#SUBMISSION -> test

In [76]:
print(df_train.shape[0])
df_train.sample()

40455


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
37996,37996,1.01,Premium,D,SI2,61.7,57.0,6.49,6.44,3.99,5006


In [77]:
print(df_test.shape[0])
df_test.sample()

13485


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
9639,9639,1.9,Very Good,J,SI1,59.2,58.0,8.1,8.21,4.83


# Cleaning, processing, feature selection, etc

In [78]:
# Processing is necessary, otherwise we won't be able to fit a model
# For the sake of the example, we'll just drop categorical columns

df_train_cleaned = df_train.select_dtypes(exclude='object')
print(df_train_cleaned.shape[0])
df_train_cleaned

40455


Unnamed: 0,id,carat,depth,table,x,y,z,price
0,0,0.60,63.1,54.0,5.41,5.33,3.39,1657
1,1,0.53,61.7,56.0,5.20,5.22,3.21,1630
2,2,0.34,62.3,60.0,4.46,4.49,2.79,417
3,3,0.30,61.6,56.0,4.33,4.37,2.68,912
4,4,1.72,60.5,59.0,7.80,7.71,4.69,7089
...,...,...,...,...,...,...,...,...
40450,40450,0.70,61.9,58.0,5.67,5.73,3.53,1843
40451,40451,0.90,62.5,56.0,6.11,6.21,3.85,3188
40452,40452,0.70,62.0,56.0,5.66,5.69,3.52,2699
40453,40453,0.61,62.4,57.0,5.44,5.46,3.40,2096


# Train on train.csv

![](https://builtin.com/sites/www.builtin.com/files/styles/ckeditor_optimize/public/inline-images/4_train-test-split.jpg)

## Train, test split

In [79]:
X = df_train_cleaned.iloc[:,:-1]
y = df_train_cleaned['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

## Fit

In [80]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Just for feedback
if platform == "darwin":
    os.system("say I'm done training")

sh: -c: line 0: unexpected EOF while looking for matching `''
sh: -c: line 1: syntax error: unexpected end of file


In [81]:
y_pred = regressor.predict(X_test)

In [82]:
np.sqrt(mean_squared_error(y_pred, y_test))

1594.1389779573806

-----
-----
-----
-----


# Applying same cleaning & processing to my `test.csv`

In [83]:
df_test_cleaned = df_test.select_dtypes(exclude='object')
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
11633,11633,1.02,61.6,55.0,6.53,6.46,4.0


# Predict on the `test.csv`

In [84]:
y_pred = regressor.predict(df_test_cleaned)
y_pred

# Just for feedback
if platform == "darwin":
    os.system("say ayam don predictin")

# DF with two columns

In [85]:
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
10117,10117,1.51,59.5,61.0,7.45,7.62,4.48


In [86]:
df_test_cleaned['price'] = y_pred # Adding the predicted price
df_for_submission = df_test_cleaned[["id", "price"]] # Modifying for subnmission

In [87]:
print(df_for_submission.shape[0])
df_for_submission.sample()

13485


Unnamed: 0,id,price
4562,4562,1549.526547


# Export (index=False)

In [88]:
df_for_submission.to_csv("my_submission.csv", index=False)

# Just for feedback
if platform == "darwin":
    os.system("say redi for submission")

In [89]:
df_for_submission.head()

Unnamed: 0,id,price
0,0,3174.445133
1,1,4766.185214
2,2,5771.49423
3,3,787.415682
4,4,3875.300849
