# TINO IQ ASSIGNMENT

### <span style="color:orange">AUTHOR:  <br><br>**Manasi Chhibber**</span>

### <span style="color:orange">AIM:  <br><br>To predict value of variable (target) for next 6 quarters using Pycaret package.</span>

### <span style="color:orange">THEORY:  <br><br>PyCaret is an open source ML library that eases the process of preparing the data to deploying the model. Here, as the 'Target' attribute has continuous type values, hence we'll use PyCaret Regression technique.</span>

### Importing Libraries

In [1]:
import pandas as pd                   # for reading the csv file
from pycaret.regression import *      # for setting up a regression model
import matplotlib.pyplot as plt       # for plotting graphs
import time                           # to keep record of training/testing time
from sklearn.metrics import r2_score  # for checking accuracy

### Getting our Data

In [2]:
df = pd.read_csv('fred_quarterly.csv')
df

Unnamed: 0,date,month,quarter,Target,gnp,gdpdef,gdp
0,1/1/2000,2000-01,2000Q1,12935.252,10035.263,77.325,10002.179
1,4/1/2000,2000-04,2000Q2,13170.749,10283.391,77.807,10247.72
2,7/1/2000,2000-07,2000Q3,13183.89,10348.645,78.263,10318.165
3,10/1/2000,2000-10,2000Q4,13262.25,10490.43,78.688,10435.744
4,1/1/2001,2001-01,2001Q1,13219.251,10510.297,79.204,10470.231
5,4/1/2001,2001-04,2001Q2,13301.394,10647.427,79.683,10599.0
6,7/1/2001,2001-07,2001Q3,13248.142,10623.039,79.996,10598.02
7,10/1/2001,2001-10,2001Q4,13284.881,10748.408,80.245,10660.465
8,1/1/2002,2002-01,2002Q1,13394.91,10833.143,80.504,10783.5
9,4/1/2002,2002-04,2002Q2,13477.356,10918.384,80.783,10887.46


### <span style="color:orange">PyCaret can independently handle the task of preprocessing the data, so we can directly setup the model and see what all processing it does.</span>

### Splitting Data for Training & Testing

In [3]:
train = df.iloc[:75,:]                # leaving out last 10 rows for testing
train

Unnamed: 0,date,month,quarter,Target,gnp,gdpdef,gdp
0,1/1/2000,2000-01,2000Q1,12935.252,10035.263,77.325,10002.179
1,4/1/2000,2000-04,2000Q2,13170.749,10283.391,77.807,10247.72
2,7/1/2000,2000-07,2000Q3,13183.89,10348.645,78.263,10318.165
3,10/1/2000,2000-10,2000Q4,13262.25,10490.43,78.688,10435.744
4,1/1/2001,2001-01,2001Q1,13219.251,10510.297,79.204,10470.231
5,4/1/2001,2001-04,2001Q2,13301.394,10647.427,79.683,10599.0
6,7/1/2001,2001-07,2001Q3,13248.142,10623.039,79.996,10598.02
7,10/1/2001,2001-10,2001Q4,13284.881,10748.408,80.245,10660.465
8,1/1/2002,2002-01,2002Q1,13394.91,10833.143,80.504,10783.5
9,4/1/2002,2002-04,2002Q2,13477.356,10918.384,80.783,10887.46


In [4]:
test = df.iloc[75:,:]
test

Unnamed: 0,date,month,quarter,Target,gnp,gdpdef,gdp
75,10/1/2018,2018-10,2018Q4,18721.281,21101.077,111.175,20813.325
76,1/1/2019,2019-01,2019Q1,18833.195,21254.334,111.514,21001.591
77,4/1/2019,2019-04,2019Q2,18982.528,21564.924,112.152,21289.268
78,7/1/2019,2019-07,2019Q3,19112.653,21780.753,112.517,21505.012
79,10/1/2019,2019-10,2019Q4,19202.31,21955.98,112.978,21694.458
80,1/1/2020,2020-01,2020Q1,18951.992,21721.267,113.346,21481.367
81,4/1/2020,2020-04,2020Q2,17258.205,19649.442,112.859,19477.444
82,7/1/2020,2020-07,2020Q3,18560.774,21365.412,113.888,21138.574
83,10/1/2020,2020-10,2020Q4,18767.778,21728.223,114.439,21477.597
84,1/1/2021,2021-01,2021Q1,19055.655,22273.06,115.652,22038.226


### Model Setup

In [5]:
model = setup(data = train, 
             target = 'Target',                                   # setting the predictor variable
             numeric_imputation = 'mean',                         # to replace null values with mean of that attribute
             numeric_features = ['gnp','gdpdef','gdp'],           # listing all numerical type attributes
             categorical_features = ['quarter','date','month'],   # listing all categorical type attributes
             normalize = True,                                    # we don't want skewed data
             silent = True)

Unnamed: 0,Description,Value
0,session_id,5060
1,Target,Target
2,Original Data,"(75, 7)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(52, 44)"


### <span style="color:orange">As it can be seen here, data was normalized and perfect collinear attributes were removed.</span>

### Model Comparison

In [6]:
# comparing different models
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,61.103,7443.7312,76.8876,0.9919,0.005,0.004,0.04
catboost,CatBoost Regressor,95.7154,12886.8335,108.2138,0.9852,0.007,0.0061,0.241
lasso,Lasso Regression,95.3145,14554.85,116.9333,0.9841,0.008,0.0064,0.006
gbr,Gradient Boosting Regressor,101.5169,14507.4546,112.6198,0.9814,0.0072,0.0065,0.01
rf,Random Forest Regressor,99.6729,15553.8345,117.1695,0.9811,0.0076,0.0064,0.05
llar,Lasso Least Angle Regression,121.8838,20201.7385,137.1118,0.9794,0.0091,0.008,0.005
omp,Orthogonal Matching Pursuit,112.9441,18901.0938,132.5605,0.9793,0.0088,0.0074,0.005
br,Bayesian Ridge,133.5243,23027.9159,147.1814,0.9773,0.0097,0.0087,0.005
ridge,Ridge Regression,163.6924,32795.0944,175.901,0.9697,0.0114,0.0106,0.004
huber,Huber Regressor,163.5109,37083.5063,185.2315,0.9642,0.0123,0.0108,0.013


ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=5060, verbose=0, warm_start=False)

### <span style="color:orange">Extra Trees Regressor and CatBoost Regressor being tree based, will divide the results into categories (there maybe a lot many different results, but still they will be categories only, based on conditions that are checked at each and every level of a tree). Rather than 'calculating' the 'Target', it would be classified into a group. Hence, skipping these 2 and going for Lasso Regression.</span>

### <span style="color:orange">LASSO REGRESSION:  <br><br> Lasso regression is a regularization technique. It is used over regression methods for a more accurate prediction. This model uses shrinkage. Shrinkage is where data values are shrunk towards a central point as the mean. </span>

<img src = 'https://miro.medium.com/max/1313/1*n-fid04EiwcTZZPO0RXlMQ.gif'>    

### Model Building

In [7]:
start = time.time()                              # to calculate the training time
model = create_model('lasso')
stop = time.time()
print(f"Training time: {stop - start}s")

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,72.1877,7679.2168,87.6311,0.9944,0.0065,0.0053
1,99.632,15041.3174,122.6431,0.996,0.0091,0.0073
2,157.4932,31998.4492,178.8811,0.936,0.0118,0.0102
3,71.1238,6642.6377,81.5024,0.9975,0.0056,0.0049
4,107.8207,21586.7148,146.9242,0.9673,0.0096,0.007
5,108.2504,15348.0859,123.8874,0.9873,0.0086,0.0074
6,73.4252,7951.5781,89.1716,0.9955,0.0057,0.0047
7,63.277,7468.5259,86.4206,0.978,0.0055,0.0039
8,86.0207,15311.9844,123.7416,0.9944,0.0084,0.0057
9,113.9148,16519.9902,128.5301,0.9942,0.0087,0.0076


Training time: 0.16577696800231934s


In [8]:
# tuning the parameters is good
tuned_model = tune_model(model)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,106.9352,12554.8643,112.0485,0.9908,0.0082,0.0077
1,160.6737,29558.0176,171.9245,0.9922,0.0125,0.0114
2,155.3129,29226.9746,170.959,0.9415,0.0112,0.01
3,131.959,20608.9883,143.5583,0.9922,0.0097,0.0089
4,133.9984,23251.7188,152.4851,0.9647,0.0099,0.0086
5,156.8896,27946.207,167.1712,0.9769,0.0115,0.0107
6,121.9561,16253.6299,127.4897,0.9908,0.0082,0.0078
7,61.2344,7412.8662,86.098,0.9782,0.0054,0.0038
8,136.0824,20559.457,143.3857,0.9924,0.0089,0.0083
9,179.3193,32993.3984,181.6409,0.9885,0.0121,0.0118


In [9]:
# our model is ready!

### <span style="color:orange">Using Lasso Regression, an accuracy till 99.24% can be reached.</span>

### Making Predictions on Training Set

In [10]:
train_1 = train.drop(['Target'], axis=1)                                                         # dropping the 'Train' attribute so that predictions could be made
train_predictions = predict_model(tuned_model, data = train_1)                                   # making the predictions
train_predictions = train_predictions.rename(columns = {'Label':'Target'})                       # renaming the resulting column as our 'Target'
print(f"Train accuracy: {round(r2_score(train['Target'], train_predictions['Target'])*100,2)}%") # using R2 score to compute accuracy
train_predictions

Train accuracy: 99.16%


Unnamed: 0,date,month,quarter,gnp,gdpdef,gdp,Target
0,1/1/2000,2000-01,2000Q1,10035.263,77.325,10002.179,13163.648438
1,4/1/2000,2000-04,2000Q2,10283.391,77.807,10247.720,13287.698242
2,7/1/2000,2000-07,2000Q3,10348.645,78.263,10318.165,13323.288086
3,10/1/2000,2000-10,2000Q4,10490.430,78.688,10435.744,13382.690430
4,1/1/2001,2001-01,2001Q1,10510.297,79.204,10470.231,13400.114258
...,...,...,...,...,...,...,...
70,7/1/2017,2017-07,2017Q3,19864.059,107.903,19558.693,17991.722656
71,10/1/2017,2017-10,2017Q4,20219.956,108.670,19882.965,18155.546875
72,1/1/2018,2018-01,2018Q1,20460.314,109.261,20143.716,18287.283203
73,4/1/2018,2018-04,2018Q2,20793.247,110.234,20492.492,18463.488281


### Making Predictions on Test Set

In [11]:
test_1 = test.drop(['Target'], axis=1)                        # dropping the 'Train' attribute so that predictions could be made
start = time.time()                                           # to calculate the testing time
test_predictions = predict_model(tuned_model, data = test_1)  # making predictions
stop = time.time()
print(f"Testing time: {stop - start}s")

test_predictions = test_predictions.rename(columns = {'Label':'Target'})  # renaming the resulting column as our 'Target'

error = 0                                                     # counter for error calculation; initially set 0
expected = list(test['Target'])                               # making lists for expected & predicted results
predicted = list(test_predictions['Target'])
for i in range(11):
    # finding the absolute error for each value (or iteration), summing it up 
    # and multiplying it by 100 to compute the percentage error
    error += abs((abs((expected[i] - predicted[i])))/expected[i])*100   
print(f"Mean absolute error: {round(error,2)}%")              # printing out that error percentage to 2 decimal places       
print(f"Accuracy: {100-round(error,2)}%")                     # 100 - (percentage of error) will give us the percentage of accuracy     
test_predictions

Testing time: 0.0722966194152832s
Mean absolute error: 11.65%
Accuracy: 88.35%


Unnamed: 0,date,month,quarter,gnp,gdpdef,gdp,Target
75,10/1/2018,2018-10,2018Q4,21101.077,111.175,20813.325,18625.576172
76,1/1/2019,2019-01,2019Q1,21254.334,111.514,21001.591,18720.691406
77,4/1/2019,2019-04,2019Q2,21564.924,112.152,21289.268,18866.029297
78,7/1/2019,2019-07,2019Q3,21780.753,112.517,21505.012,18975.027344
79,10/1/2019,2019-10,2019Q4,21955.98,112.978,21694.458,19070.736328
80,1/1/2020,2020-01,2020Q1,21721.267,113.346,21481.367,18963.082031
81,4/1/2020,2020-04,2020Q2,19649.442,112.859,19477.444,17950.673828
82,7/1/2020,2020-07,2020Q3,21365.412,113.888,21138.574,18789.898438
83,10/1/2020,2020-10,2020Q4,21728.223,114.439,21477.597,18961.175781
84,1/1/2021,2021-01,2021Q1,22273.06,115.652,22038.226,19244.414062


### Making Predictions for the next 6 Quarters

In [12]:
# but first, let's create a sample set for the next 6 quarters
data = [['7/1/2021', '2021-07', '2021Q3', 23768.420, 118.575, 23498.231],  # quarter-1
        ['10/1/2021', '2021-10', '2021Q4', 22832.897, 116.235, 22143.997], # quarter-2
        ['1/1/2022', '2022-01', '2022Q1', 21435.112, 115.789, 18765.309],  # quarter-3
        ['4/1/2022', '2022-04', '2022Q2', 24310.577, 119.690, 20320.005],  # quarter-4
        ['7/1/2022', '2022-07', '2022Q3', 23994.312, 117.719, 21845.269],  # quarter-5
        ['10/1/2022', '2022-10', '2022Q4', 22984.351, 118.748, 19118.800]] # quarter-6
 
# creating the pandas DataFrame
final_test = pd.DataFrame(data, columns = ['date','month','quarter','gnp','gdpdef','gdp'])
final_test

Unnamed: 0,date,month,quarter,gnp,gdpdef,gdp
0,7/1/2021,2021-07,2021Q3,23768.42,118.575,23498.231
1,10/1/2021,2021-10,2021Q4,22832.897,116.235,22143.997
2,1/1/2022,2022-01,2022Q1,21435.112,115.789,18765.309
3,4/1/2022,2022-04,2022Q2,24310.577,119.69,20320.005
4,7/1/2022,2022-07,2022Q3,23994.312,117.719,21845.269
5,10/1/2022,2022-10,2022Q4,22984.351,118.748,19118.8


In [13]:
# making the predictions on the sample set
predictions = predict_model(tuned_model, data = final_test)
predictions = predictions.rename(columns = {'Label':'Target'})
predictions

Unnamed: 0,date,month,quarter,gnp,gdpdef,gdp,Target
0,7/1/2021,2021-07,2021Q3,23768.42,118.575,23498.231,19982.027344
1,10/1/2021,2021-10,2021Q4,22832.897,116.235,22143.997,19297.849609
2,1/1/2022,2022-01,2022Q1,21435.112,115.789,18765.309,17590.894531
3,4/1/2022,2022-04,2022Q2,24310.577,119.69,20320.005,18376.345703
4,7/1/2022,2022-07,2022Q3,23994.312,117.719,21845.269,19146.929688
5,10/1/2022,2022-10,2022Q4,22984.351,118.748,19118.8,17769.482422


### <span style="color:orange">The 'Target' attribute shows the final predictions.</span>

### Saving the Predictions in a csv file

In [14]:
predictions.to_csv('ManasiChhibber_TargetPredictions.csv')