In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# from sklearn.metrics import mean_absolute_percentage_error
# from sklearn.metrics import mean_absolute_percentage_error ### over ver 0.24
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


In [None]:
### Read co2.csv: skip the first 53 lines of the file, then take the second
### remaining line as the header row (header=1)
raw = pd.read_csv('co2.csv', header=1, skiprows=53)

### Discard the first two rows below the header and renumber the rows from 0
df = raw.iloc[2:].reset_index(drop=True)

### Typed copies of the month and CO2 columns (the source column names keep
### their leading spaces)
df['Mn_int'] = df[' Mn'].astype(int)
df['CO2_float'] = df['     CO2'].astype(float)

### Regressor: (Mn_int + 0.5) / 12
### NOTE(review): ' Mn' looks like a month column; if it is month-of-year
### rather than a running month count, t_i is not a time axis -- confirm.
df['t_i'] = (df['Mn_int'] + 0.5) / 12

### Keep only the rows whose CO2 value is not the -99.99 placeholder
has_co2 = df['CO2_float'] != -99.99
df_processed = df[has_co2]
df_processed.head()




In [None]:
### Split the data into training and test sets in the ratio 80:20 with the
### train_test_split function (fixed random_state, so the split is repeatable)
y = df_processed['CO2_float'].values
### Double brackets select a one-column frame, so X is 2-D
X = df_processed[['t_i']].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)


In [None]:
### Fit a linear regression of y_train on X_train
reg = LinearRegression().fit(X_train, y_train)
### Echo the fitted estimator as the cell output
reg

In [None]:
### Display the fitted coefficient(s) of `reg` as the cell output
reg.coef_
# reg.intercept_

In [None]:
### Predict the target for the held-out test points
y_pred = reg.predict(X_test)

### Residual = observed - predicted, one value per test sample
residual = y_test - y_pred

### Plot every residual against the t_i of the SAME test sample.
### train_test_split shuffles the rows, so the first len(residual) rows of df
### are not the test samples; X_test holds the matching x values.
plt.plot(X_test[:, 0], residual, 'o', color='red')
plt.xlabel('t_i')
plt.ylabel('residual (observed - predicted)');

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` over all elements.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    numpy.float64
        The MAPE expressed as a percentage (10.0 means 10 %).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Same number of elements but a different layout, e.g. (n, 1) vs (n,):
    # align the shapes so the subtraction is element-wise and does not
    # silently broadcast into an (n, n) matrix.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_pred = y_pred.reshape(y_true.shape)

    # Floor the denominator at machine epsilon so a zero target does not
    # produce nan/inf from a division by zero.
    denom = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100

In [None]:
### Root-mean-squared error on the test split. The square root is taken
### explicitly because the `squared` keyword of mean_squared_error is
### deprecated/removed in recent scikit-learn releases.
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
### Mean absolute percentage error (in percent) on the test split
print('MAPE: ', mean_absolute_percentage_error(y_test, y_pred))

In [None]:

### NOTE(review): `df_new` is not assigned in any cell visible above this one;
### unless it is defined outside the visible source, this cell raises
### NameError on a fresh kernel -- confirm where it should come from.
### Cast the values of df_new to float
df_new = df_new.astype(float)
df_new.head()

### Drop the rows whose index label is returned by selecting df_new == -99.99.
### NOTE(review): this presumably expects df_new to be a Series. For a
### DataFrame, boolean-frame selection keeps every row, so every row would be
### dropped -- TODO confirm.
df_new_drop = df_new.drop(df_new[df_new == -99.99].index)
df_new_drop.head()

In [None]:
### Values of df_new_drop as a NumPy array
arr_values = df_new_drop.values

### x axis built from the remaining index labels: (label + 0.5) / 12
arr_index = (df_new_drop.index.to_numpy() + 0.5) / 12

### Sanity check: both shapes first, then both container types
for arr in (arr_values, arr_index):
    print(arr.shape)
for arr in (arr_values, arr_index):
    print(type(arr))

In [None]:
### 80:20 split of (arr_index, arr_values). random_state is fixed so the
### split, and every number computed from it, is the same on each run.
### NOTE: y_train / y_test rebind the names produced by the earlier split.
x_train, x_test, y_train, y_test = train_test_split(
    arr_index, arr_values, test_size=0.2, random_state=0
)

### Reshape every array into a single column
x_train = x_train.reshape(-1, 1)
x_test = x_test.reshape(-1, 1)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [None]:
### Making Simple Linear Regression Model with x_train and y_train
### Simple linear regression fitted on x_train / y_train
reg = LinearRegression().fit(x_train, y_train)
### Echo the fitted estimator as the cell output
reg


In [None]:
### Report the fitted parameters of `reg`, intercept first
for label, value in (('Intercept: ', reg.intercept_), ('Coef: ', reg.coef_)):
    print(label, value)

In [None]:
### Predictions of `reg` on the training inputs and the matching residuals
### (observed - predicted)
prediction = reg.predict(x_train)
residual = np.subtract(y_train, prediction)

### Residuals against the training x values, drawn as red dots
fig, ax = plt.subplots()
ax.plot(x_train, residual, 'o', color='red');

In [None]:
### Root-mean-squared error on the training data. The square root is taken
### explicitly because the `squared` keyword of mean_squared_error is
### deprecated/removed in recent scikit-learn releases.
print('RMSE: ', np.sqrt(mean_squared_error(y_train, prediction)))
### Mean absolute error on the training data
print('MAE: ', mean_absolute_error(y_train, prediction))

In [None]:
### Polynomial regression: expand x into polynomial terms up to `degree`,
### then fit a linear model on the expanded features
degree = 9
polyreg = make_pipeline(
    PolynomialFeatures(degree=degree),
    LinearRegression(),
)
polyreg.fit(x_train, y_train)


In [None]:
### A Pipeline object is not a feature matrix, so it cannot be passed as X to
### fit/predict. Use the pipeline's polynomial step to expand x_train into
### features and fit/predict on those instead.
### (make_pipeline names each step after its lowercased class name.)
X_poly_train = polyreg.named_steps['polynomialfeatures'].transform(x_train)

lin_reg = LinearRegression()
lin_reg.fit(X_poly_train, y_train)

### In-sample predictions of the polynomial model
y_pred = lin_reg.predict(X_poly_train)