#Importation
We shall import seed, randrange, reader and sqrt

In [1]:
# Linear Regression With Stochastic Gradient Descent for Wine Quality
from random import seed
from random import randrange
from csv import reader
from math import sqrt

#Load Csv File
We will use the csv module that is part of the standard library to load the file
We will use the load_csv() function that will wrap this behaviour and return the dataset.
Then we will take the loaded dataset as list of lists.
We shall open the file loaded in read only mode.
The reader() takes the file as an argument.
The first list is the list of observation of rows and the second is a list of column valus for a given row.

In [2]:
# Load a CSV file
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset

#Datatype Conversion
Through the str_column_to_float() We shall convert the given column values to floating point value.
We then use strip() to remove any whitespace before the conversion.

In [3]:
# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())


#Finding Minimum And Maximum Values
We use dataset_minmax() with dataset as parameters that will lastly return the dataset. 
We take min ans max values as list of lists.
Use for to wrap the first dataset with the column values in each row.
Assign value_min to min(col_values) and value_max to max(col_values)
We shall then join the minimum col_values and maximum col_values.


In [4]:
# Find the min and max values for each column
def dataset_minmax(dataset):
	minmax = list()
	for i in range(len(dataset[0])):
		col_values = [row[i] for row in dataset]
		value_min = min(col_values)
		value_max = max(col_values)
		minmax.append([value_min, value_max])
	return minmax

#Normalisation Of Data
We shall use normalize_dataset() function with parameters dataset and minmax
We use loop each row by taking values in a row and subtracting the min value then dividing by Max value minus minimum value. 
We apply the normalization formula (X - x(min)) / (x(max) - x(min))

In [5]:
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
	for row in dataset:
		for i in range(len(row)):
			row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


#Spliting Data Into K-folds
Each group of a data is called fold.
We use cross_validation_split() function to split a dataset into train and test split. 
We assign to it parameters: dataset and n_folds.
We take dataset_split as list of lists.
We create a copy of the dataset from which to draw randomly chosen rows.
We take the fold_size as total number of rows / total number of folds.
Random rows are removed from the copied dataset to the fold_size until the total number of rows target is achieved.
The randrange() function gives a random integer between 0 and the size of the list.
The list of rows that we have created is added to the folds that is returned at the end.


In [6]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for i in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

#Calculating Rmse
We use rmse_metric() function with actual and predicted parameters.
We initialise sum error to 0.0
To get the predicted error we subtract the actual value from the predicted value.
We square the predicted error to avoid having negative values and assign it to sum_error.
We assign mean_error to sum_error / actual value. Then we return sqrt with parameter mean_error at the end.

In [7]:
# Calculate root mean squared error
def rmse_metric(actual, predicted):
	sum_error = 0.0
	for i in range(len(actual)):
		prediction_error = predicted[i] - actual[i]
		sum_error += (prediction_error ** 2)
	mean_error = sum_error / float(len(actual))
	return sqrt(mean_error)

#Evaluation Of An Algorithm Using A Cross Validation Split
We use evaluate_algorithm() function with 3 fixed parameters dataset, algorithm and n_folds.
We split the data into n_folds called folds.
We take scores as list of lists.
We loop each fold, giving it an opportunity to be held out of the traing and be used in evaluating the algorithm.
We split the data into training and test elements.
We loop each row then create a copy of the fold and held out fold is removed from the list. 
The sum() function flattens the folds into a long list of rows to match the algorithms expectations of the training dataset.
A copy of the test_dataset is made and output cleared, to avoid accidental cheating algorithms.
Each output is cleared by setting value to None.
The algorithm is a function that expects the trained and tesy datasets on which to make predictions
We loop the predicted values from each row using the algorithm() function.
The rmse_matric() function has parameters actual and predicted. 
It compares the predicted values to actual values of unmodified test dataset.
The predicted values are stored in a list.


In [8]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		rmse = rmse_metric(actual, predicted)
		scores.append(rmse)
	return scores

#Making Prediction
Let us start by defining predict function using row and coefficients parameters.
Then we initialise inputs as 0 to get predicted values. 
We shall proceed by giving range of rows in array form Later, we iterate getting coefficients as per epoch for each instance. 
Lastly, we shall return the predicted value

In [9]:
# Make a prediction with coefficients
def predict(row, coefficients):
	yhat = coefficients[0]
	for i in range(len(row)-1):
		yhat += coefficients[i + 1] * row[i]
	return yhat

#Estimation Of Linear Regression Coefficients Using SGD
First define coefficients_sgd function with train, l_rate and epoch parameters.
We will use for loop 4 times.
--> use for to set coeffient row(1) to 0. 
--> use for loop to give epoch range set sum error to 0 
--> use for loop to train each row set yhat predict function with row, coef parameters.
square the error gotten to get a positive value define the equation to be used.
--> use for loop to give range of rows. print epoch as an int, l_rate as float and error as float. return the coefficients.

In [10]:
# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
	coef = [0.0 for i in range(len(train[0]))]
	for epoch in range(n_epoch):
		for row in train:
			yhat = predict(row, coef)
			error = yhat - row[-1]
			coef[0] = coef[0] - l_rate * error
			for i in range(len(row)-1):
				coef[i + 1] = coef[i + 1] - l_rate * error * row[i]
			# print(l_rate, n_epoch, error)
	return coef


#Linear Regression Algorithm Using SGD
We use linear_regression_sgd() function with train, test, l_rate and n_epoch parameters.
We take predictions as list of lists.
We take the coeffients_sgd() function and assign it coef value.
We loop each row to take the inputs of the prediction in rows and coef.
We join the inputs gotten and pass them through predictions which is later returned.


In [11]:
# Linear Regression Algorithm With Stochastic Gradient Descent
def linear_regression_sgd(train, test, l_rate, n_epoch):
	predictions = list()
	coef = coefficients_sgd(train, l_rate, n_epoch)
	for row in test:
		yhat = predict(row, coef)
		predictions.append(yhat)
	return(predictions)

#Loading And Preparing Data
We assign a file from our directory a value.
We loop each dataset to be read and convert the values from string to float
We normalize the dataset by assigning it minmax value.

In [12]:
# Linear Regression on African Crises dataset
seed(1)
# load and prepare data
filename = 'African_Crises_Project1.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)
# normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

#Evaluating Algorithm
We assign n_folds, l_rate and n_epoch values.
We call coefficients_sgd() function and output the coefficients.
We call evaluate_algorithm() function and output the scores.
We loop each row to strip the whitspaces.
We output the RMSE as a float.

In [13]:
# evaluate algorithm
n_folds = 5
l_rate = 0.01
n_epoch = 50

coef = coefficients_sgd(dataset, l_rate, n_epoch)
print(coef)


scores = evaluate_algorithm(dataset, linear_regression_sgd, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
def str_column_to_int(dataset, column):
	for row in dataset:
		row[column] = int(row[column].strip())
print('Mean RMSE: %.3f' % (sum(scores)/float(len(scores))))

[0.23926865274441833, 0.29856866823060524]
Scores: [0.3392682096863841, 0.3122106838922133, 0.3340352614987483, 0.34963793531969556, 0.3400613084016554]
Mean RMSE: 0.335


#Making A Prediction Using Coefficients
We have our coefficients as [0.2393, 0.2986]
We can plug these coeffients manually in the equation y = B0 + B1*x. 
Where y is our inflation_crises and x our inflation_annual_cpi.Let us predict the values using the coefficients calculated.
First we start by defining predict function having row and coefficents parameters.
Initialise input as 0 to get predicted value.
Give the range of rows in array form iterate getting co-efficients as per epoch for each instance return the predicted value.
We convert string to float in order to multiply.
We then insert the coefficients gotten above to predict the model.
Use for loop to get the predicted output for each row Lastly print the expected output together with the predicted output.



In [16]:
# Make a prediction with coefficients
def predict(row, coefficients):
	yhat = coefficients[0]
	for i in range(len(row)-1):
		yhat += coefficients[i + 1] * row[i]
	return yhat

filename = 'African_Crises_Project1.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)
coef = [0.23926865274441833, 0.29856866823060524]
for row in dataset:
	yhat = predict(row, coef)
	print("Expected=%.3f, Predicted=%.3f" % (row[-1], yhat))

Expected=0.000, Predicted=1.267
Expected=0.000, Predicted=4.464
Expected=0.000, Predicted=-0.871
Expected=0.000, Predicted=3.584
Expected=0.000, Predicted=-0.910
Expected=0.000, Predicted=-6.008
Expected=0.000, Predicted=-0.289
Expected=1.000, Predicted=8.932
Expected=0.000, Predicted=-0.206
Expected=0.000, Predicted=-4.786
Expected=0.000, Predicted=1.398
Expected=0.000, Predicted=4.007
Expected=0.000, Predicted=-3.450
Expected=0.000, Predicted=-0.176
Expected=0.000, Predicted=-4.521
Expected=0.000, Predicted=3.152
Expected=1.000, Predicted=6.874
Expected=0.000, Predicted=5.125
Expected=1.000, Predicted=8.637
Expected=1.000, Predicted=14.075
Expected=1.000, Predicted=12.680
Expected=1.000, Predicted=9.021
Expected=0.000, Predicted=5.668
Expected=1.000, Predicted=20.909
Expected=1.000, Predicted=19.239
Expected=1.000, Predicted=7.289
Expected=0.000, Predicted=0.239
Expected=0.000, Predicted=2.252
Expected=0.000, Predicted=2.125
Expected=0.000, Predicted=-0.056
Expected=0.000, Predicted=

Having y as the inflation_crises and x as inflation_annual_cpi we use the equation y = B0 + B1(x)
Using 1 = 0.2986 + 0.2393(x).
We have annual rate of inflation as 2.9310. 
2.9310 is the annual rate of inflation at which inflation becomes a practical certainty.