IMPORTING DATA AND LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the training and test sets from CSV files located in the
# notebook's working directory.
train = pd.read_csv("training.csv")
test = pd.read_csv("testing.csv")

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Pairwise correlation matrix of the numeric columns.
# `corr` is a method and has to be called; string columns are excluded
# because a correlation coefficient is undefined for them.
train.select_dtypes(include="number").corr()

In [None]:
# (rows, columns) of the test frame.
test.shape

PRE-PROCESSING AND DATA CLEANING

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance that the categorical-encoding loops re-fit
# for each string column.
le = LabelEncoder()

In [None]:
# Impute missing values in the training frame, one column at a time.
# Numeric columns are filled with the column mean. Non-numeric columns
# are filled with their most frequent value, because `.mean()` is not
# defined for strings and would raise.
for columns in train.columns[train.isna().any()]:
    if pd.api.types.is_numeric_dtype(train[columns]):
        fill_value = train[columns].mean()
    else:
        modes = train[columns].mode(dropna=True)
        if modes.empty:
            # Column is entirely missing: there is nothing to impute from.
            continue
        fill_value = modes.iloc[0]
    train[columns] = train[columns].fillna(fill_value)

In [None]:
# Impute missing values in the test frame.
# The fill value is taken from the matching training column whenever one
# exists with a compatible dtype, so both frames are imputed with the
# same statistic; otherwise the test column's own statistic is used.
# Numeric columns use the mean, non-numeric columns the most frequent
# value (`.mean()` is not defined for strings).
for columns in test.columns[test.isna().any()]:
    is_numeric = pd.api.types.is_numeric_dtype(test[columns])
    reference = test[columns]
    if columns in train.columns:
        if pd.api.types.is_numeric_dtype(train[columns]) == is_numeric:
            reference = train[columns]
    if is_numeric:
        fill_value = reference.mean()
    else:
        modes = reference.mode(dropna=True)
        if modes.empty:
            # Reference column is entirely missing: nothing to impute from.
            continue
        fill_value = modes.iloc[0]
    test[columns] = test[columns].fillna(fill_value)

In [None]:
# Label-encode every string column of the training frame.
# When the test frame has the same string column, the encoder is fitted on
# the union of both frames' values and the test column is encoded here as
# well. Fitting separately on each frame would assign different integer
# codes to the same category. Test columns handled here are no longer of
# object dtype, so the test-encoding loop that follows skips them.
# Values are cast to `str` so a stray NaN cannot break the encoder's sort.
for columns in train.select_dtypes(['object']):
    train_values = train[columns].astype(str)
    if columns in test.columns and test[columns].dtype == object:
        test_values = test[columns].astype(str)
        le.fit(pd.concat([train_values, test_values], ignore_index=True))
        test[columns] = le.transform(test_values)
    else:
        le.fit(train_values)
    train[columns] = le.transform(train_values)

In [None]:
# Replace each string-typed column of the test frame with integer labels;
# the shared encoder is re-fitted on every column in turn.
for col_name in test.select_dtypes(include='object').columns:
    test[col_name] = le.fit(test[col_name]).transform(test[col_name])

In [None]:
# Count of remaining missing values per training column.
train.isna().sum()

In [None]:
# Count of remaining missing values per test column.
test.isna().sum()

In [None]:
# Column labels of the training frame.
train.columns

In [None]:
# Column labels of the test frame.
test.columns

In [None]:
# Separate the feature matrix from the regression target `price_doc`.
x = train.drop('price_doc', axis=1)
y = train.loc[:, 'price_doc']

PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
from sklearn import datasets

# Scaler instance used by the standardisation cell that follows.
std_slc = StandardScaler()

In [None]:
# Standardise the features.
# The scaler learns its mean and variance from the training features only;
# the test frame is transformed with those same parameters. Re-fitting on
# the test data would put the two sets on different scales.
train_std = std_slc.fit_transform(x)
# Select the training feature columns, in the same order, so the test
# matrix lines up with what the scaler was fitted on.
test_std = std_slc.transform(test[x.columns])
print(train_std)
train_std.shape

Create a PCA that retains 85% of the variance (`n_components=0.85`)

In [None]:
# Correlation matrix of the standardised features.
# `train_std` is a NumPy array, which has no `.corr`, so it is wrapped in a
# DataFrame (labelled with the feature names from `x`) before calling it.
pd.DataFrame(train_std, columns=x.columns).corr()

In [None]:
from sklearn.decomposition import PCA
# Fit PCA on the standardised training features, keeping enough components
# to explain 85% of the variance (with whitening).
pca = PCA(n_components=0.85, whiten=True)
train_pca = pca.fit_transform(train_std)
# Project the test set with the components learned from the training set.
# Re-fitting on test would yield a different basis, and possibly a
# different number of components, than the models are trained on.
test_pca = pca.transform(test_std)


In [None]:
# Report the dimensionality before and after the PCA projection.
n_features_before = train_std.shape[1]
n_features_after = train_pca.shape[1]
print("original number of features", n_features_before)
print("reduced number of feature", n_features_after)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 16% of the PCA-projected training data for validation.
# A fixed `random_state` keeps the split, and every metric computed from
# it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    train_pca, y, test_size=0.16, random_state=42
)

In [None]:
# Show the shapes of the four pieces produced by the split.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

LINEAR REGRESSION

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
# Ordinary least-squares fit on the training portion of the split.
# `fit` returns the estimator, so it can be bound in one step.
lin_reg = LinearRegression().fit(x_train, y_train)
lin_reg

In [None]:
# Predict on the held-out split.
predictions = lin_reg.predict(x_test)
# A bare expression in the middle of a cell is never displayed, so the
# preview is printed explicitly and limited to the first few values.
print(predictions[:5])
# R^2 of the model on the held-out split (displayed as the cell output).
lin_reg.score(x_test, y_test)


LINEAR REGRESSION MODEL EVALUATION

In [None]:


# Mean squared error and R^2 of the linear model on the held-out split.
lin_mse = metrics.mean_squared_error(y_test, predictions)
lin_r2 = metrics.r2_score(y_test, predictions)
print("MSE = %.3f" % lin_mse)
print("R-squared = %.3f" % lin_r2)

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the linear model on the held-out split.
# Arguments follow the documented (y_true, y_pred) order. The value is the
# same either way for MAE, but the swapped order becomes wrong as soon as
# `sample_weight` or a non-symmetric metric is used.
mae = mean_absolute_error(y_test, predictions)
print(mae)

In [None]:
# Read the sample file from the working directory; a `price_doc` column is
# written into this frame further down.
output = pd.read_csv("sample.csv")

In [None]:
# Inspect the PCA-projected test matrix.
test_pca

In [None]:
# Linear-regression predictions for the PCA-projected test set.
pre = lin_reg.predict(test_pca)


In [None]:
# Store the predictions in the `price_doc` column of the output frame.
# NOTE(review): assumes `pre` has one value per row of `output` - confirm.
output["price_doc"] = pre

In [None]:
# Write the output frame to LReg.csv without the index column.
output.to_csv('LReg.csv', index = False)

POLYNOMIAL REGRESSION

In [None]:
# Features and target for the polynomial model, taken from the
# un-projected training frame.
# The target is selected by name, as in the earlier x/y split, rather than
# by position: positional slicing silently picks the wrong column if
# `price_doc` is not the last one.
xx = train.drop(columns=['price_doc'])
yy = train['price_doc']

print(xx.columns)
print(yy.name)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the raw features into all polynomial terms up to degree 2
# (squares as well as pairwise interactions).
poly = PolynomialFeatures(degree=2, interaction_only=False)
X2 = poly.fit(xx).transform(xx)
print(X2.shape)
expanded_names = poly.get_feature_names_out()
print(expanded_names)

In [None]:
# Polynomial regression: a linear model fitted on the degree-2 feature
# expansion `X2`. Splitting the raw `xx` here would just repeat a plain
# linear regression and leave the polynomial features unused.
# A fixed `random_state` keeps the split reproducible.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X2, yy, test_size=0.16, random_state=42
)
poly_reg = LinearRegression()
poly_reg.fit(X_train1, y_train1)
print(poly_reg.intercept_)
# R^2 on the held-out part of the polynomial feature matrix.
print(poly_reg.score(X_test1, y_test1))
y_predict = poly_reg.predict(X_test1)
print("MSE = %.3f" % metrics.mean_squared_error(y_test1, y_predict))
print("R-squared = %.3f" % metrics.r2_score(y_test1, y_predict))

REGRESSION TREE

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree regressor on the PCA features.
# Tree construction breaks ties between equally good splits at random, so
# the seed is fixed to make the reported metrics reproducible.
regressionTree = DecisionTreeRegressor(random_state=42)
regressionTree.fit(x_train, y_train)
y_predict = regressionTree.predict(x_test)
print("MSE = %.3f" % metrics.mean_squared_error(y_test, y_predict))
print("R-squared = %.3f" % metrics.r2_score(y_test, y_predict))

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the split using statistics learned from the training part only.
scaler = StandardScaler().fit(x_train)
X_train_norm = scaler.transform(x_train)
X_test_norm = scaler.transform(x_test)
# `x_train` / `x_test` come from the PCA output and are NumPy arrays, which
# have no `.columns`. Column labels are reused when the inputs are
# DataFrames; otherwise pandas assigns default integer labels.
feature_names = getattr(x_train, "columns", None)
X_train = pd.DataFrame(X_train_norm, columns=feature_names)
X_test = pd.DataFrame(X_test_norm, columns=feature_names)

RANDOM FOREST (REGRESSION ENSEMBLE)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest regressor on the PCA features.
# Bootstrap sampling and feature sub-sampling are random, so the seed is
# fixed to make the reported metrics reproducible.
randForest = RandomForestRegressor(random_state=42)
randForest.fit(x_train, y_train)
y_predict = randForest.predict(x_test)
print("MSE = %.3f" % metrics.mean_squared_error(y_test, y_predict))
print("R-squared = %.3f" % metrics.r2_score(y_test, y_predict))

ADABOOST (REGRESSION ENSEMBLE)

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost regressor on the PCA features.
# The boosting rounds resample the training data at random, so the seed is
# fixed to make the reported metrics reproducible.
adaBoost = AdaBoostRegressor(random_state=42)
adaBoost.fit(x_train, y_train)
y_predict = adaBoost.predict(x_test)
print("MSE = %.3f" % metrics.mean_squared_error(y_test, y_predict))
print("R-squared = %.3f" % metrics.r2_score(y_test, y_predict))

GRADIENT BOOSTING REGRESSION (REGRESSION ENSEMBLE)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosting regressor on the PCA features.
# The seed is fixed so the fitted trees, and therefore the reported
# metrics, are reproducible across runs.
graBoost = GradientBoostingRegressor(random_state=42)
graBoost.fit(x_train, y_train)
y_predict = graBoost.predict(x_test)
print("MSE = %.3f" % metrics.mean_squared_error(y_test, y_predict))
print("R-squared = %.3f" % metrics.r2_score(y_test, y_predict))