## Importing the essential libraries

In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
# Modelling
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


## Listing the names of the datasets available in the seaborn library

In [23]:
# List the names of the example datasets that seaborn can load
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

## Loading the flights dataset from the seaborn library

In [24]:
# Load seaborn's "flights" example dataset into a DataFrame
data=sns.load_dataset("flights")

In [25]:
# Preview the first rows to see which columns are available
data.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [26]:
# Inspect the dtype of the year column before any conversion
data['year'].dtype

dtype('int64')

In [27]:
# (rows, columns) of the loaded frame
data.shape

(144, 3)

## Converting the datatype of year to string

In [28]:
# Cast year to str so it can be looked up in the string-keyed year_mapping defined further down
data['year']=data['year'].astype(str)

In [29]:
# Count the rows recorded for each year
data['year'].value_counts()

year
1949    12
1950    12
1951    12
1952    12
1953    12
1954    12
1955    12
1956    12
1957    12
1958    12
1959    12
1960    12
Name: count, dtype: int64

In [30]:
# Count the rows recorded for each month
data['month'].value_counts()

month
Jan    12
Feb    12
Mar    12
Apr    12
May    12
Jun    12
Jul    12
Aug    12
Sep    12
Oct    12
Nov    12
Dec    12
Name: count, dtype: int64

## Encoding the categorical features as numbers using explicit mappings

### Encoding the month feature

In [31]:
# Month abbreviations in calendar order; each one is encoded as its zero-based position
month_mapping={abbr:position for position,abbr in enumerate(
    ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"])}

In [32]:
# Replace each month abbreviation with its integer code from month_mapping.
# Series.map can hand back a categorical column when the input column is categorical
# (seaborn appears to load month that way -- TODO confirm), so cast to plain int to
# guarantee a numeric feature column; the cast also fails loudly on any unmapped month.
data['month']=data['month'].map(month_mapping).astype(int)

### Encoding the year feature

In [33]:
# Encode each year as its offset from 1949, keyed by the year written as a string
year_mapping={str(year):year-1949 for year in range(1949,1961)}


In [34]:
# Replace each year string with its integer code from year_mapping
data['year']=data['year'].map(year_mapping)

In [35]:
# Display the frame after encoding both year and month
data

Unnamed: 0,year,month,passengers
0,0,0,112
1,0,1,118
2,0,2,132
3,0,3,129
4,0,4,121
...,...,...,...
139,11,7,606
140,11,8,508
141,11,9,461
142,11,10,390


## Creating the features and labels

In [36]:
# Features are the encoded year and month columns; the label is the passenger count
X=data[['year','month']].values
y=data['passengers'].values

## Splitting the dataset into a training set and a testing set so that overfitting can be detected on unseen data

In [37]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split
# is reproducible. train_test_split is imported in the imports cell at the top.
# NOTE(review): rows are shuffled before splitting, so held-out months sit between
# training months of the same years -- the score measures interpolation, not the
# forecasting of future months. Confirm that this is the intended evaluation.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model (experimenting with different regression algorithms and selecting the one that best predicts the dependent variable, as measured by the R² score)

In [64]:
# Fit a k-nearest-neighbours regressor with scikit-learn's default settings on the
# training split. Any of the other regressors imported at the top (e.g. LinearRegression)
# can be swapped in here for comparison; the R² on the held-out split is computed in
# the evaluation cell further down.
regressor=KNeighborsRegressor()
regressor.fit(X_train,y_train)

In [65]:
# Predict on the held-out rows, then print actual (left column) against predicted
# (right column) values, rounded to two decimals
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_test,y_pred)))

[[148.  153. ]
 [374.  336. ]
 [301.  315. ]
 [178.  165.6]
 [362.  364.8]
 [463.  467. ]
 [201.  195.6]
 [114.  128.4]
 [407.  376.4]
 [125.  142.4]
 [548.  528. ]
 [360.  361.6]
 [183.  195.6]
 [191.  184.4]
 [264.  262.2]
 [162.  166. ]
 [145.  134.8]
 [136.  136.8]
 [559.  534.4]
 [235.  214.6]
 [317.  289.2]
 [404.  416.4]
 [180.  189.2]
 [315.  315. ]
 [242.  218.8]
 [491.  468.8]
 [227.  240.6]
 [508.  488.2]
 [422.  406.2]]


In [66]:
# R² (coefficient of determination) of the predictions on the held-out test split;
# r2_score is already imported in the imports cell at the top of the notebook.
r2_score(y_test,y_pred)

0.9847939782160603