### Exercise 1.3: Build a linear regression model to predict Australia's temperature

#### 1.  Import necessary Python packages for the exercise

In [9]:
import pandas as pd
import sklearn  

#### 2. Read the CSV file into a DataFrame

In [12]:
# Load the temperature records into a DataFrame and preview the first rows.
csv_path = "tas_1991_2016_AUS.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,Temperature - (Celsius),Year,Month,Country,ISO3
0,28.2684,1991,Jan,Australia,AUS
1,27.9415,1991,Feb,Australia,AUS
2,25.7541,1991,Mar,Australia,AUS
3,22.2463,1991,Apr,Australia,AUS
4,18.4007,1991,May,Australia,AUS


#### 3. Rename the columns of the DataFrame

In [14]:
# Replace the original headers with short lowercase names.
# The assignment is positional, so the list must match the column order.
column_names = ['temperature', 'year', 'month', 'country', 'cty']
df.columns = column_names
df.head(n=5)

Unnamed: 0,temperature,year,month,country,cty
0,28.2684,1991,Jan,Australia,AUS
1,27.9415,1991,Feb,Australia,AUS
2,25.7541,1991,Mar,Australia,AUS
3,22.2463,1991,Apr,Australia,AUS
4,18.4007,1991,May,Australia,AUS


#### 4. Drop the `country` and `cty` columns

In [16]:
# Keep an untouched copy of the full frame before removing columns.
copy_df = df.copy(deep=True)

# Remove the two columns named in the step heading (country and cty).
df = df.drop(columns=['country', 'cty'])
df.head(n=5)

Unnamed: 0,temperature,year,month
0,28.2684,1991,Jan
1,27.9415,1991,Feb
2,25.7541,1991,Mar
3,22.2463,1991,Apr
4,18.4007,1991,May


#### 5. Apply feature engineering to the DataFrame
##### a. Convert the year column to an ordered category and replace each year with its integer code

In [17]:
# Ordered list of the years covered by the data. A year's position in this
# list is the integer code it is replaced with (1991 -> 0, ..., 2016 -> 25).
year_cat = list(range(1991, 2017))

# `astype('category', categories=..., ordered=...)` is no longer accepted by
# pandas; the categories and their ordering are passed via a CategoricalDtype.
year_dtype = pd.CategoricalDtype(categories=year_cat, ordered=True)

# `.cat.codes` yields the position of each value within `year_cat`; a value
# that is not listed in `year_cat` gets the code -1.
# NOTE: this overwrites `year` in place, so re-running the cell on the already
# encoded column would turn every value into -1.
df['year'] = df['year'].astype(year_dtype).cat.codes

  


In [18]:
# Preview the frame after the year column was replaced by its integer codes.
df.head(n=5)

Unnamed: 0,temperature,year,month
0,28.2684,0,Jan
1,27.9415,0,Feb
2,25.7541,0,Mar
3,22.2463,0,Apr
4,18.4007,0,May


In [21]:
# Print which integer code stands for which calendar year.
# Each year is looked up by its code (not by loop position), so the mapping
# stays correct even if the rows are not sorted by year or a year is missing.
year_codes = sorted(df.year.unique())  # computed once instead of per iteration
for code in year_codes:
    # A negative code marks a value that was not listed in `year_cat`; indexing
    # with it would silently wrap around to the last year.
    year_label = year_cat[code] if code >= 0 else 'not in year_cat'
    print(f'{code} => {year_label}')

0 => 1991
1 => 1992
2 => 1993
3 => 1994
4 => 1995
5 => 1996
6 => 1997
7 => 1998
8 => 1999
9 => 2000
10 => 2001
11 => 2002
12 => 2003
13 => 2004
14 => 2005
15 => 2006
16 => 2007
17 => 2008
18 => 2009
19 => 2010
20 => 2011
21 => 2012
22 => 2013
23 => 2014
24 => 2015
25 => 2016


##### b. Encode the month column as numbers

In [22]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every object-dtype (string) column and keep each fitted
# encoder in `dic`, keyed by column name.
# Note: the resulting month codes follow alphabetical label order
# (Apr=0, Feb=3, Jan=4, Mar=7, May=8, ...), not calendar order.
dic = {}
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    dic[name] = encoder

df.head(n=5)

Unnamed: 0,temperature,year,month
0,28.2684,0,4
1,27.9415,0,3
2,25.7541,0,7
3,22.2463,0,0
4,18.4007,0,8


#### 6. Split dataframe into training and test set

In [23]:
from sklearn.model_selection import train_test_split

# Features are the encoded year and month; the target is the temperature.
feature_columns = ['year', 'month']
X = df[feature_columns]
y = df['temperature']

# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

#### 7. Use training set to train a regression model

In [24]:
from sklearn.linear_model import LinearRegression 

# Create the linear regression estimator and fit it on the training split.
# fit() returns the estimator itself, so the result can be bound directly.
model = LinearRegression().fit(x_train, y_train)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### 8. Use the model to predict temperature

In [27]:
# Predict temperatures for the held-out rows and preview the first ten values.
preview_count = 10
predictions = model.predict(x_test)
predictions[:preview_count]

array([22.02135137, 21.43104919, 21.29653558, 21.29444567, 22.2945584 ,
       22.55522597, 22.43116192, 21.30907505, 21.87847812, 21.28399612])

#### 8. Check performance of the model

In [28]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the actual and predicted test-set temperatures
# (same unit as the target column).
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
mae

4.041190958541058

#### 9. Save the model

In [29]:
# `sklearn.externals.joblib` was removed from scikit-learn; joblib is a
# standalone package (installed as a scikit-learn dependency) and is imported
# directly.
import joblib

# Serialize the fitted model to 'model.pkl'; dump() returns the list of file
# names it wrote.
joblib.dump(model, 'model.pkl')

['model.pkl']