# Prediction using supervised ML

# Author : Harshit Patel

# Problem statement

-> Predict the percentage of a student based on the number of study hours.

-> This is a simple linear regression task.

-> What will be the predicted score if a student studies for 9.25 hrs/day?

# (1) Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# (2) Importing Dataset

In [None]:
# Load student_scores.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("student_scores.csv")

# (3) Exploratory Data Analysis

# (3.1) Dataset Shape

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

# (3.2) Viewing the Dataset

In [None]:
# Preview the first few rows to check that the file was parsed as expected
df.head()

# (3.3) Describing the Dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

# (3.4) Checking for any missing values

In [None]:
# Number of missing entries in each column
df.isnull().sum()

# (3.5) Detailed information about the Dataset

In [None]:
# Print column names, non-null counts, dtypes and memory usage
df.info()

# (4) Data Visualization

In [None]:
# Scatter the raw observations to eyeball the relationship between hours and scores.
# df.plot returns the Axes it drew on, so the labels are set on that object directly.
scatter_axes = df.plot(x='Hours', y='Scores', style='o')
scatter_axes.set_title('Hours vs Percentage')
scatter_axes.set_xlabel('Hours Studied')
scatter_axes.set_ylabel('Percentage Score')
plt.show()

# (5) Data Processing

In [None]:
# Division of data into attributes (input) and labels (output).
# Columns are selected by name rather than by position so the result does not
# depend on the column order or on extra columns appearing in the CSV.
x = df[['Hours']].values  # 2-D feature matrix, shape (n_samples, 1), as scikit-learn expects
y = df['Scores'].values   # 1-D target vector, shape (n_samples,)

# (6) Model Training

In [None]:
# Splitting the data into training and testing sets:
# 20% of the rows are held out for testing, and the fixed random_state makes
# the split reproducible between runs.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Fit a linear regression of scores on hours using the training split only
regression_model = LinearRegression()
regression_model.fit(X=x_train, y=y_train)

# (7) Plotting the Regression line

In [None]:
# Evaluate the fitted line (slope * hours + intercept) at every observed x
slope = regression_model.coef_
intercept = regression_model.intercept_
line = slope * x + intercept

# Overlay the fitted line (green) on the scatter of all observations
plt.scatter(x, y)
plt.plot(x, line, c='g')
plt.title('Line of Regression')
plt.xlabel('Hours Studied')
plt.ylabel('Percentage Score')
plt.show()

In [None]:
# Slope(s) learned by the model: one coefficient per input feature
learned_coefficients = regression_model.coef_
print(learned_coefficients)

# (8) Making Predictions

In [None]:
# Show the held-out inputs, then predict on both splits with the fitted model
print(x_test)
y_pred_test = regression_model.predict(x_test)
y_pred_train = regression_model.predict(x_train)

# (9) Comparison of Actual and Predicted Model Results

In [None]:
df = pd.DataFrame({'Actual' : y_test, 'Predicted' : y_pred_test})
df

In [None]:
#Estimating training and test score
regression_model.score(x_train, y_train)

In [None]:
regression_model.score(x_test, y_test)

In [None]:
df.plot(kind = 'bar', figsize = (5,5))
plt.grid(which = 'major', linewidth = '0.5', color = 'blue')
plt.grid(which = 'minor', linewidth = '0.5', color = 'red')
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Quadratic feature expansion of the single input: [1, x, x^2].
# interaction_only is left at its default (False): with only one input feature
# there are no cross terms, so interaction_only=True would drop x^2 and reduce
# the "polynomial" model to the same straight-line fit as before.
poly = PolynomialFeatures(degree = 2)

# Fit the expansion on the training split only and re-use it on the test split.
# Dedicated names keep the linear model's y_pred_train / y_pred_test intact.
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

poly_clf = linear_model.LinearRegression()
poly_clf.fit(x_train_poly, y_train)
# y_pred holds the polynomial model's predictions for the test split
y_pred = poly_clf.predict(x_test_poly)

# R^2 on the training split
print(poly_clf.score(x_train_poly, y_train))

# R^2 on the test split
print(poly_clf.score(x_test_poly, y_test))

# Feature-matrix shape before and after the polynomial expansion
print(x_train.shape)
print(x_train_poly.shape)

# (10) Predicting with user input data

In [None]:
# Ask for a number of study hours and predict the matching score with the linear model.
hours_studied = float(input("Enter the number of hours studied per day: "))
# Reject values that cannot be hours in a day (this also catches NaN, which fails both comparisons).
if not 0 <= hours_studied <= 24:
    raise ValueError("Hours studied per day must be between 0 and 24, got {}".format(hours_studied))

hours = [[hours_studied]]  # predict() expects a 2-D (n_samples, n_features) input
own_pred = regression_model.predict(hours)

# A straight line can extrapolate outside the valid percentage range,
# so the reported score is clamped to [0, 100].
predicted_score = max(0, min(own_pred[0], 100))
print("No of Hours = {}".format(hours[0][0]))
print("Predicted Score (Percentage) = {}".format(predicted_score))

# (11) Model Evaluation

In [None]:
from sklearn import metrics

# Evaluate the linear model on the held-out split. The predictions are recomputed
# here so the report does not depend on whichever cell last assigned `y_pred`.
y_pred_linear = regression_model.predict(x_test)

# MSE is computed once and reused for the RMSE
mse = metrics.mean_squared_error(y_test, y_pred_linear)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_linear))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print('R-2:', metrics.r2_score(y_test, y_pred_linear))

# THANK YOU :)