## Single variable regression


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# Seed NumPy's global generator so the noise drawn below is reproducible.
np.random.seed(1)

# Ten integer inputs 0..9 and a noisy linear response:
# slope 2, intercept 20, plus one standard-normal draw per point.
X = np.arange(10)
noise = np.random.randn(X.size)
Y = 2 * X + 20 + noise

# Scatter of the raw points, drawn through an explicit Axes handle.
fig, ax = plt.subplots()
ax.scatter(X, Y);

You can visually confirm that there is a linear trend here, so which line fits these points best?

In [None]:
# Fit a least-squares line. X is 1-D, so it is reshaped into a single-column
# matrix (one row per sample) before being passed to fit().
lr = LinearRegression().fit(X.reshape(-1, 1), Y)

# Read slope and intercept directly from the fitted estimator instead of
# recovering the intercept indirectly by predicting at X = 0.
slope, intercept = lr.coef_[0], lr.intercept_
print("Y =", slope, 'X +', intercept)

In [None]:
# Training points.
plt.scatter(X, Y)

# Evaluate the fitted model on a finer grid (step 0.5) that runs from -1 up to
# 10.5, i.e. slightly past both ends of the training inputs, and draw it as a line.
Xnew = np.arange(-1, 11, 0.5).reshape(-1, 1)
Ynew = lr.predict(Xnew)

# The trailing semicolon belongs on the cell's last statement: it stops the
# notebook from echoing the list of Line2D objects returned by plot().
plt.plot(Xnew, Ynew, color='g');

## Going to higher dimensions - Bike sharing

This dataset is available on [Kaggle](https://www.kaggle.com/lakshmi25npathi/bike-sharing-dataset).

In [None]:
!mkdir data
!wget -O data/bikes.csv https://raw.githubusercontent.com/MJafarMashhadi/MachineLearningWorkshop/master/data/bikes.csv

In [None]:
# Load the CSV written by the download cell.
df = pd.read_csv('data/bikes.csv')

# 'cnt' is the regression target; every other column is kept as a feature.
# NOTE(review): the CSV schema is not visible here. Confirm that no remaining
# column is non-numeric or derived from 'cnt' before trusting the scores below.
target = df['cnt']
features = df.drop(columns=['cnt'])

df

In [None]:
# Columns holding discrete codes rather than measurements.
categorical_cols = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

# Cast them all to pandas' category dtype in one call. `features` and `target`
# were already split off from df, so this affects only df itself.
df = df.astype({name: 'category' for name in categorical_cols})

# With default arguments describe() reports only numeric columns when any are
# present, so the category-typed columns drop out of the summary. The 'count'
# row is removed from the displayed table.
df.describe().drop(index=['count'])

### Splitting the data and training a regression model

In [None]:
# Repeat the 75/25 train/test split with ten different seeds (10..19) to see
# how much the scores move with the particular split that was drawn.
# Note: `lr` is rebound on every iteration, replacing the single-variable model.
records = []
for seed in np.arange(10, 20):
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=seed
    )
    lr = LinearRegression().fit(X_train, y_train)

    # score() is scaled by 100 so the values read as percentages.
    records.append({
        'Training': 100 * lr.score(X_train, y_train),
        'Test': 100 * lr.score(X_test, y_test),
    })

# One row per split; summarise across splits and hide the constant 'count' row.
scores = pd.DataFrame(records, columns=['Training', 'Test'])
scores.describe().drop(index=['count'])