# **Packages**

In [21]:
import pandas as pd
import seaborn as sns
from sklearn import preprocessing, svm
from sklearn.linear_model import RANSACRegressor, LinearRegression

# **Import files**

In [22]:
# Source locations of the project CSVs (all hosted in the same GitHub repo).
x_tr_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/x_tr.csv'
y_tr_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/y_tr.csv'
x_te_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/x_te.csv'
y_te_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/y_te.csv'
te_final_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/te_final.csv'
sample_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/sampleSubmission.csv'

# Features / targets used to fit the model (x_tr, y_tr) and to score it (x_te, y_te).
x_tr, y_tr = pd.read_csv(x_tr_url), pd.read_csv(y_tr_url)
x_te, y_te = pd.read_csv(x_te_url), pd.read_csv(y_te_url)

# Feature table the final predictions are made on.
te_final = pd.read_csv(te_final_url)

# Submission template; its `Weekly_Sales` column is overwritten further down.
sample = pd.read_csv(sample_url)

# **Conduct RANSAC regression**

Find the highest score by testing different combinations of the hyperparameters `min_samples`, `max_trials`, and `loss`.

In [23]:
# Hyperparameter grid: every combination of these three lists is fitted once.
min_samples = [50, 100, 1000, 10000, 20000]
max_trails = [1, 10, 50, 100, 200]
losses = ['absolute_error', 'squared_error']

# The score can be negative (most of the runs printed below are), so the
# running maximum must start at -inf. Starting at 0.0 would leave
# `parameter_best` at its placeholder whenever no run scores above zero, and
# the refit cell would then be built from meaningless values.
score_highest = float('-inf')
# Layout: [min_samples, max_trials, loss, score]; overwritten by the first run.
parameter_best = [None, None, None, score_highest]

# NOTE(review): candidates are ranked by their score on x_te / y_te. If that
# split is also the final hold-out set, the reported best score is optimistic.
for min_sample in min_samples:
  for max_trail in max_trails:
    for loss in losses:
      # random_state is fixed so that each configuration is reproducible.
      RSC = RANSACRegressor(estimator=LinearRegression(),
                            min_samples=min_sample, max_trials=max_trail,
                            loss=loss, random_state=42,
                            residual_threshold=10)
      RSC.fit(x_tr, y_tr)
      score = RSC.score(x_te, y_te)
      print('Min sample:\t', min_sample, '\nMax trail:\t', max_trail,
            '\nLoss:\t\t', loss, '\nScore:\t\t', score)
      # Keep the configuration with the highest score seen so far.
      if score > score_highest:
        score_highest = score
        parameter_best = [min_sample, max_trail, loss, score]
      print('---')

Min sample:	 50 
Max trail:	 1 
Loss:		 absolute_error 
Score:		 -819.621688845142
---
Min sample:	 50 
Max trail:	 1 
Loss:		 squared_error 
Score:		 -1383.0307668742587
---
Min sample:	 50 
Max trail:	 10 
Loss:		 absolute_error 
Score:		 -1.8861454489218619
---
Min sample:	 50 
Max trail:	 10 
Loss:		 squared_error 
Score:		 -2.258209219642095
---
Min sample:	 50 
Max trail:	 50 
Loss:		 absolute_error 
Score:		 -0.473612101131458
---
Min sample:	 50 
Max trail:	 50 
Loss:		 squared_error 
Score:		 -1.2021105343784906
---
Min sample:	 50 
Max trail:	 100 
Loss:		 absolute_error 
Score:		 -8.264000615750497
---
Min sample:	 50 
Max trail:	 100 
Loss:		 squared_error 
Score:		 -12.462951138831079
---
Min sample:	 50 
Max trail:	 200 
Loss:		 absolute_error 
Score:		 -8.264000615750497
---
Min sample:	 50 
Max trail:	 200 
Loss:		 squared_error 
Score:		 -0.3570552013245405
---
Min sample:	 100 
Max trail:	 1 
Loss:		 absolute_error 
Score:		 -123.65145246746997
---
Min sample:	 100 
M

# **Print the best hyperparameters**

In [24]:
# Report the winning configuration; `parameter_best` is laid out as
# [min_samples, max_trials, loss, score].
best_summary = (
    'Final best score:',
    '\nMin sample:\t', parameter_best[0],
    '\nMax trail:\t', parameter_best[1],
    '\nLoss:\t\t', parameter_best[2],
    '\nScore:\t\t', parameter_best[3],
)
print(*best_summary)

Final best score: 
Min sample:	 20000 
Max trail:	 1 
Loss:		 squared_error 
Score:		 0.09189602282357823


# **Make the prediction**

In [25]:
# Rebuild the regressor with the best configuration found by the search
# (parameter_best = [min_samples, max_trials, loss, score]) and fit it on the
# training data. `fit` returns the estimator itself, so the call is chained.
RSC = RANSACRegressor(
    estimator=LinearRegression(),
    residual_threshold=10,
    random_state=42,
    min_samples=parameter_best[0],
    max_trials=parameter_best[1],
    loss=parameter_best[2],
).fit(x_tr, y_tr)

# Predict on the final feature table and wrap the result in a one-column frame.
prediction_array = RSC.predict(te_final)
prediction = pd.DataFrame(prediction_array, columns=['Weekly_Sales_cbrt'])
prediction

Unnamed: 0,Weekly_Sales_cbrt
0,20.023385
1,20.038143
2,20.052902
3,20.067661
4,20.082419
...,...
115059,19.117102
115060,19.131860
115061,19.146619
115062,19.176136


In [26]:
# Cube the `Weekly_Sales_cbrt` column to obtain `Weekly_Sales`.
# NOTE(review): presumably this undoes a cube-root transform applied to the
# target during preprocessing — confirm against the preprocessing notebook.
prediction['Weekly_Sales'] = prediction['Weekly_Sales_cbrt'] ** 3
prediction

Unnamed: 0,Weekly_Sales_cbrt,Weekly_Sales
0,20.023385,8028.094756
1,20.038143,8045.859550
2,20.052902,8063.650531
3,20.067661,8081.467720
4,20.082419,8099.311134
...,...,...
115059,19.117102,6986.604482
115060,19.131860,7002.798121
115061,19.146619,7019.016763
115062,19.176136,7051.529135


In [27]:
# Copy the predictions into the submission template row by row.
# Assigning a Series aligns on the index and silently fills NaN where the two
# frames disagree, so check the row counts explicitly and assign by position.
if len(prediction) != len(sample):
    raise ValueError(
        'prediction has %d rows but the submission template has %d'
        % (len(prediction), len(sample))
    )
sample['Weekly_Sales'] = prediction['Weekly_Sales'].to_numpy()
sample

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,8028.094756
1,1_1_2012-11-09,8045.859550
2,1_1_2012-11-16,8063.650531
3,1_1_2012-11-23,8081.467720
4,1_1_2012-11-30,8099.311134
...,...,...
115059,45_98_2013-06-28,6986.604482
115060,45_98_2013-07-05,7002.798121
115061,45_98_2013-07-12,7019.016763
115062,45_98_2013-07-19,7051.529135


In [28]:
# Write the submission file. `index=False` keeps the DataFrame's RangeIndex
# out of the CSV, so the file has the same two columns (Id, Weekly_Sales) as
# the template instead of an extra unnamed first column.
sample.to_csv('RANSAC_Result.csv', index=False)