In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

'''
The 'ts-course-data' dataset's book_sales.csv file is a time series of book sales. As used in this notebook, it has a 'Date' column giving the chronological order of the observations, plus 'Hardcover' and 'Paperback' columns giving the sales recorded for each format on that date. This dataset is useful for analyzing sales trends over time, comparing the popularity of the two formats, and practicing time series forecasting.
'''

### Importing Libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import  LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams.update({'figure.figsize': [8, 4]})

#### Reading the csv file

In [None]:
# Load the book sales CSV from the Kaggle input directory and display it.
BOOK_SALES_PATH = '/kaggle/input/ts-course-data/book_sales.csv'
df = pd.read_csv(BOOK_SALES_PATH)
df

'''
Since the 'Hardcover' column already provides sufficient information for this analysis, keeping the 'Paperback' column as well would only add redundant data, so it is dropped.
'''

In [None]:
# Drop the 'Paperback' column; only 'Hardcover' is used as the target below.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run is a no-op, not a KeyError.
df = df.drop(columns=['Paperback'], errors='ignore')

'''
Setting the index of your DataFrame to the 'Date' column is a crucial step for time series modeling. This ensures that your data is organized chronologically, which is essential for analyzing and forecasting time-dependent patterns. By setting the index to the date, you enable efficient time-based operations such as resampling, slicing, and lagging, which are fundamental in time series analysis.
'''

In [None]:
# DataFrame.set_index returns a new frame rather than modifying `df`,
# so the result must be assigned back for 'Date' to actually become the index.
df = df.set_index('Date')
df

'''
Adding a time variable to your dataframe is a crucial step for time series modeling. This allows you to incorporate the temporal aspect into your analysis, enabling your model to understand and leverage patterns over time. With the 'Time' variable representing the sequence of observations, your model can better capture trends, seasonality, and other time-dependent patterns. It's a fundamental component for building accurate and robust time series models.
'''

In [None]:
'''
y(hardcover) = w*time + b
'''

# Time dummy: 0, 1, 2, ... one step per row, in row order.
df['Time'] = np.arange(df.shape[0])

In [None]:
# Preview the first rows of df to check the newly added 'Time' column
df.head()

'''
Lag features are essentially creating new columns in a DataFrame by shifting existing columns by a certain number of time steps. These lag features are useful in time series analysis and forecasting tasks as they capture past values of a variable which can be predictive of future values.
'''

In [None]:
'''
Lag Features = Extra feature column
'''
# Lag features: the 'Hardcover' value from one and two rows earlier.
# The first rows have no earlier value, so shift() leaves NaN there.
hardcover_sales = df['Hardcover']
df['Lag1'] = hardcover_sales.shift(periods=1)
df['Lag2'] = hardcover_sales.shift(periods=2)

In [None]:
# Scatter plot of Hardcover against Time with a fitted regression line.
# Keeping the returned Axes as the last expression preserves the cell output.
trend_ax = sns.regplot(data=df, x='Time', y='Hardcover')
trend_ax

In [None]:
'''
y(hardcover) = w*time + b

y(hardcover) = w1*time + w2*previous_day_sales + w3*sales_two_days_ago + b
'''
# Preview the first rows of df, which now include the Lag1 and Lag2 columns
df.head()

In [None]:
# Drop rows that contain missing values instead of filling them with zero.
# shift() leaves NaN in the lag columns of the first rows (there is no earlier
# observation to copy). Replacing those NaNs with 0 would present "zero sales"
# to the model as if it were real data and skew the fitted coefficients.
df = df.dropna()

In [None]:
# Partition the data chronologically: the first N_TRAIN rows form the
# training set and all remaining rows form the test set.
N_TRAIN = 20
df_train, df_test = df.iloc[:N_TRAIN], df.iloc[N_TRAIN:]

In [None]:
# Split each partition into the feature matrix (X) and the target (y).
# Selecting with a list of column names keeps y as a one-column DataFrame.
feature_cols = ['Time', 'Lag1', 'Lag2']
target_cols = ['Hardcover']

X_train, y_train = df_train[feature_cols], df_train[target_cols]
X_test, y_test = df_test[feature_cols], df_test[target_cols]

X_train

#### Data Preprocessing

In [None]:
# Standardisation: z = (x - mean) / standard_deviation
#   fit       -> learn the mean and standard deviation from the data
#   transform -> apply the learned statistics
#
# The statistics are learned from the TRAINING data only and then reused for
# the test data, so both sets are on the same scale and nothing from the test
# set leaks into preprocessing. Features and target get separate scalers so
# that fitting one does not overwrite the statistics of the other.

ss=StandardScaler()      # feature scaler
ss_y=StandardScaler()    # target scaler

X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

y_train=ss_y.fit_transform(y_train)
y_test=ss_y.transform(y_test)

X_train

In [None]:
# Train a linear regression model on the training set.
# fit() returns the estimator itself, so construction and training are chained;
# the trailing expression keeps the fitted estimator as the cell output.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# Display the (standardised) test features that are passed to the model below
X_test

In [None]:
# Predict the target for the test set with the fitted linear regression model.
# y_train was standardised before fitting, so these predictions are in
# standardised units rather than raw Hardcover values.
y_pred=lin_reg.predict(X_test)
y_pred

In [None]:
# Parameters of the model: the intercept (bias term b) learned by the fit
lin_reg.intercept_

In [None]:
# Learned weights, in the column order used for fitting: Time, Lag1, Lag2
lin_reg.coef_

In [None]:
# Mean squared error on the test set.
# scikit-learn's signature is mean_squared_error(y_true, y_pred); passing the
# arguments by keyword keeps the ground truth and the predictions in their
# documented roles.
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the test features again for reference
X_test

In [None]:
# Predict Hardcover for a new observation: Time=22, Lag1=222, Lag2=217.
# lin_reg was trained on standardised features and a standardised target, so
# the raw values must be scaled with the training statistics before predicting
# and the prediction mapped back to the original units afterwards.
# The scalers are rebuilt here from df_train so this cell does not depend on
# the state of any scaler object created earlier in the notebook.
feature_cols = ['Time', 'Lag1', 'Lag2']
new_x_scaler = StandardScaler().fit(df_train[feature_cols].to_numpy())
new_y_scaler = StandardScaler().fit(df_train[['Hardcover']].to_numpy())

new_obs = np.array([[22, 222, 217]])
pred_scaled = lin_reg.predict(new_x_scaler.transform(new_obs))

# Convert the standardised prediction back to raw Hardcover units
new_y_scaler.inverse_transform(pred_scaled)

In [None]:
# Compute the R-squared score (coefficient of determination) on the test set
lin_reg.score(X_test,y_test)

'''
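A negative R² score means the model fits the test data worse than simply predicting the mean of the target.
In other words, the model has not captured a proper relationship in the data, so the model obtained is not good.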

'''