In [4]:
# Find the top 5 features most correlated with the target label (revenue), then build a model on those 5 features.
# Evaluate the model using MAE, MSE, RMSE and the R2 score, then compare the results with the RMSE and R2 achieved in question 2.

# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Plot defaults for the notebook: ggplot theme and a (10, 6) figure size
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Load the raw dataset; the path is relative to the notebook's working directory
train = pd.read_csv(filepath_or_buffer='data.csv')

# Keep only the numeric columns; these are the ones fed to the correlation matrix
numeric_features = train.select_dtypes(include='number')

# Pairwise correlation between all numeric features (revenue included)
corr = numeric_features.corr()

# Correlation of every numeric column with revenue, most positive first
revenue_corr = corr['revenue'].sort_values(ascending=False)
print(revenue_corr[:6], '\n')   # revenue itself (1.0) plus the 5 most positive features
print(revenue_corr[-5:], '\n')  # the 5 most negative features

# Derive the feature list from the ranking instead of copying names by hand
# from the printed output, so it cannot go stale when data.csv changes.
# revenue is removed (it is the target, not a feature) and NaN correlations
# are discarded so they cannot be picked up by the tail slice.
N_PER_SIGN = 5
candidates = revenue_corr.drop('revenue').dropna()
selected = list(candidates.index[:N_PER_SIGN]) + list(candidates.index[-N_PER_SIGN:])
# dict.fromkeys removes duplicates (head and tail overlap when fewer than
# 2 * N_PER_SIGN features exist) while preserving order.
most_cor = list(dict.fromkeys(selected))
# NOTE(review): this keeps the existing selection of the 5 most positive plus
# the 5 most negative features (10 in total), whereas the task statement at
# the top of this cell asks for 5 features -- confirm which is intended.

# Missing-value count per column, largest first, as a one-column table
# indexed by feature name (not referenced again in this cell)
nulls = (
    train.isna()
    .sum()
    .sort_values(ascending=False)
    .rename_axis('Feature')
    .to_frame(name='Null Count')
)

# Missing values: interpolate within the numeric columns, then drop every row
# that still contains a NaN afterwards
data = (
    train.select_dtypes(include='number')
    .interpolate()
    .dropna()
)

# Target for the multiple linear model: natural log of revenue, so the model
# is fit (and later scored) on the log scale
y = np.log(data['revenue'])

# Predictors: remove the target column first, then keep only the selected
# correlated features
X = data.drop(columns='revenue')[most_cor]

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs
from sklearn import linear_model
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Linear regression fitted on the training rows. fit() returns the estimator,
# so `model` and `lr` name the same fitted object.
lr = linear_model.LinearRegression()
model = lr.fit(X_train, y_train)

# Score the fitted model on the held-out rows
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# NOTE: despite its name, y_actual holds the model's predictions for X_test;
# the true (log-revenue) values are in y_test.
y_actual = model.predict(X_test)

mae = mean_absolute_error(y_test, y_actual)
mse = mean_squared_error(y_test, y_actual)
r2 = r2_score(y_test, y_actual)

print('MAE is obtained as : ', mae, sep='')  # MAE
print('MSE is obtained as : ', mse, sep='')  # MSE
print("\nR^2 is obtained as : ", r2)         # R2 score
def RMSE(predict, target):
    """Return the root-mean-squared error between predictions and targets.

    Both inputs are converted to float NumPy arrays before subtracting, so the
    values are compared position by position. Without the conversion, two
    pandas Series would be aligned on their index labels instead, which gives
    NaN (or a mean over only the overlapping labels) when the indexes differ,
    and plain Python lists would not support the subtraction at all.

    Parameters
    ----------
    predict : array-like
        Predicted values (list, ndarray or Series).
    target : array-like
        True values, in the same order as ``predict``.

    Returns
    -------
    numpy.float64
        sqrt(mean((predict - target) ** 2)). NaN values in either input
        propagate to the result rather than being skipped.
    """
    predicted = np.asarray(predict, dtype=float)
    expected = np.asarray(target, dtype=float)
    return np.sqrt(np.mean((predicted - expected) ** 2))
# Bind the arguments by keyword so they match RMSE's declared parameters:
# y_actual holds the model's predictions, y_test the true values. The metric
# is symmetric, so the printed value is the same either way.
print("RMSE is obtained as : ", RMSE(predict=y_actual, target=y_test))  # RMSE

revenue    1.000000
P2         0.191518
P28        0.155534
P6         0.139094
P21        0.097411
P11        0.084247
Name: revenue, dtype: float64 

P34   -0.072343
P10   -0.073220
P8    -0.084215
P13   -0.105085
P29   -0.114846
Name: revenue, dtype: float64 

MAE is obtained as : 0.37646032884575287
MSE is obtained as : 0.23693702726322638

R^2 is obtained as :  0.013855615047432934
RMSE is obtained as :  0.48676177670727844
