# **Packages**

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing, svm
from sklearn.cross_decomposition import PLSRegression

# **Import files**

In [20]:
# Raw-file locations of the project's CSVs on GitHub.
x_tr_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/x_tr.csv'
y_tr_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/y_tr.csv'
x_te_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/x_te.csv'
y_te_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/y_te.csv'
te_final_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/te_final.csv'
sample_url = 'https://raw.githubusercontent.com/MingchengHe/4AI3_Project/main/sampleSubmission.csv'

# x_tr / y_tr are used to fit the models; x_te / y_te are used to score them.
x_tr = pd.read_csv(x_tr_url)
y_tr = pd.read_csv(y_tr_url)
x_te = pd.read_csv(x_te_url)
y_te = pd.read_csv(y_te_url)

# Feature rows for which the final predictions are generated.
te_final = pd.read_csv(te_final_url)

# Submission template whose 'Weekly_Sales' column is filled in further down.
sample = pd.read_csv(sample_url)

# **Conduct PLS regression**

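Grid-search `n_components`, `max_iter` and `tol`: fit a PLS model for every combination on the training split, score it (R²) on the held-out split, and keep the combination with the highest score.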

In [21]:
# Hyperparameter grid for the PLS model.
n_components = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
max_iters = [500, 1000, 2500, 5000, 10000]
tols = [1e-6, 2e-5, 1e-5]

# PLSRegression.score returns R^2, which can be negative for a model that is
# worse than predicting the mean.  Start from -inf (not 0.0) and pre-bind
# `parameter_best`, so the best combination is always recorded and later cells
# never hit a NameError.
score_highest = float('-inf')
parameter_best = None

# NOTE(review): the combination is selected on (x_te, y_te), so the winning
# score is an optimistic estimate -- consider a separate validation split or
# cross-validation on the training data.
for n_component in n_components:
  for max_iter in max_iters:
    for tol in tols:
      # Fit on the training split, evaluate on the held-out split.
      PLS = PLSRegression(n_components=n_component, max_iter=max_iter, tol=tol)
      PLS.fit(x_tr, y_tr)
      score = PLS.score(x_te, y_te)
      print('n_component:\t', n_component, '\nmax_iter:\t', max_iter,
            '\ntol:\t', tol, '\nScore:\t', score)
      # Strict '>' keeps the first combination that reaches the top score.
      if score > score_highest:
        score_highest = score
        parameter_best = [n_component, max_iter, tol, score]
      print('---')

n_component:	 5 
max_iter:	 500 
tol:	 1e-06 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 500 
tol:	 2e-05 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 500 
tol:	 1e-05 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 1000 
tol:	 1e-06 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 1000 
tol:	 2e-05 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 1000 
tol:	 1e-05 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 2500 
tol:	 1e-06 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 2500 
tol:	 2e-05 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 2500 
tol:	 1e-05 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 5000 
tol:	 1e-06 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 5000 
tol:	 2e-05 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 5000 
tol:	 1e-05 
Score:	 0.09277325829836403
---
n_component:	 5 
max_iter:	 10000 
tol:	 1e-06 
Score:	

# **Print the best hyperparameters**

In [22]:
# parameter_best is [n_component, max_iter, tol, score] as stored by the search loop.
best_n_component, best_max_iter, best_tol, best_score = parameter_best

print('Final best score:',
      '\nn_component:\t', best_n_component,
      '\nmax_iter:\t', best_max_iter,
      '\ntol:\t', best_tol,
      '\nScore:\t', best_score)

Final best score: 
n_component:	 11 
max_iter:	 500 
tol:	 1e-06 
Score:	 0.09281809728083357


# **Make the prediction**

In [23]:
# Refit on the training data with the selected hyperparameters.
# The tolerance must come from parameter_best[2]; the bare name `tol` is only
# the leftover loop variable of the grid search (its last grid value), not the
# value that was selected.
PLS = PLSRegression(n_components=parameter_best[0],
                    max_iter=parameter_best[1],
                    tol=parameter_best[2])
PLS.fit(x_tr, y_tr)

# Predict for the final feature set; the column name marks the values as being
# on the cube-root scale.
prediction_array = PLS.predict(te_final)
prediction = pd.DataFrame(prediction_array)
prediction.columns = ['Weekly_Sales_cbrt']
prediction

Unnamed: 0,Weekly_Sales_cbrt
0,20.366997
1,20.377515
2,20.388033
3,20.398550
4,20.409068
...,...
115059,19.142103
115060,19.152621
115061,19.163139
115062,19.184175


In [24]:
# Cube the cube-root-scale predictions to obtain 'Weekly_Sales'.
prediction['Weekly_Sales'] = prediction['Weekly_Sales_cbrt'] ** 3
prediction

Unnamed: 0,Weekly_Sales_cbrt,Weekly_Sales
0,20.366997,8448.526846
1,20.377515,8461.622509
2,20.388033,8474.731697
3,20.398550,8487.854419
4,20.409068,8500.990679
...,...,...
115059,19.142103,7014.051550
115060,19.152621,7025.619786
115061,19.163139,7037.200735
115062,19.184175,7060.400799


In [25]:
# Copy the predicted sales into the submission template; pandas aligns the
# values to `sample` by row index.
sample = sample.assign(Weekly_Sales=prediction['Weekly_Sales'])
sample

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,8448.526846
1,1_1_2012-11-09,8461.622509
2,1_1_2012-11-16,8474.731697
3,1_1_2012-11-23,8487.854419
4,1_1_2012-11-30,8500.990679
...,...,...
115059,45_98_2013-06-28,7014.051550
115060,45_98_2013-07-05,7025.619786
115061,45_98_2013-07-12,7037.200735
115062,45_98_2013-07-19,7060.400799


In [26]:
# Write the submission file.  index=False keeps the DataFrame's row index out
# of the CSV, so the file contains only the frame's own columns instead of an
# extra unnamed leading column.
sample.to_csv('PLS_Result.csv', index=False)