**Importing CSV Data**

In [None]:
import pandas as pd
import numpy as np

In [None]:
path="https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"

In [None]:
df=pd.read_csv(path , header=None)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# Column names follow the attribute list of the UCI Car Evaluation dataset;
# the sixth attribute is "safety" (previously misspelled as 'safaty').
df.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

**Basic insights from data**

In [None]:
df.dtypes

In [None]:
df.describe()

In [None]:
df.describe(include='all')

In [None]:
df.info()

**Accessing a database using the DB-API**

In [None]:
import MySQLdb

In [None]:
import os

# Create a connection object. Settings are read from the environment when
# present; the fallbacks reproduce the previous local-development defaults.
connection = MySQLdb.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", ""),
)
try:
    # Create a cursor object
    cursor = connection.cursor()
    try:
        # Run a query; IF NOT EXISTS keeps the cell re-runnable once the
        # database has been created.
        cursor.execute('CREATE DATABASE IF NOT EXISTS library')
    finally:
        # Free resources even when the query fails
        cursor.close()
finally:
    connection.close()

**Data Preprocessing**

Data preprocessing is the process of converting or mapping data from its initial form into another format in order to prepare it for further analysis. It is also called data cleaning or data wrangling.

**Dealing with missing values**

In [None]:
# Dictionary of lists: one key per score column, np.nan marks a missing value.
# Named score_data so the built-in `dict` type is not shadowed for the rest
# of the notebook.
score_data = {'First Score': [100, np.nan, np.nan, 95],
              'Second Score': [30, np.nan, 45, 56],
              'Third Score': [52, np.nan, 80, 98],
              'Fourth Score': [60, 67, 68, 65]}

# Creating a dataframe from the dictionary
df = pd.DataFrame(score_data)
 


In [None]:
df

In [None]:
# Preview: a copy of df without any row that contains a missing value.
df.dropna(axis="index", how="any")

In [None]:
# Using dropna(): keep the complete rows in a separate frame rather than
# dropping in place. Dropping in place leaves df with a single row, so the
# mean-imputation cells below have nothing to fill and the later four-value
# column assignments fail with a length mismatch.
df_complete = df.dropna(axis=0)
df_complete

In [None]:
# Preview: drop only the rows whose "Second Score" is missing.
df.dropna(axis="index", subset=["Second Score"])

Replacing missing values

In [None]:
# Fill missing values with the column mean. Assigning the result back avoids
# chained in-place modification, which does not update df under pandas
# Copy-on-Write.
df["First Score"] = df["First Score"].fillna(df["First Score"].mean())

In [None]:
# Fill missing values with the column mean. Assigning the result back avoids
# chained in-place modification, which does not update df under pandas
# Copy-on-Write.
df["Second Score"] = df["Second Score"].fillna(df["Second Score"].mean())

In [None]:
# Fill missing values with the column mean. Assigning the result back avoids
# chained in-place modification, which does not update df under pandas
# Copy-on-Write.
df["Third Score"] = df["Third Score"].fillna(df["Third Score"].mean())

In [None]:
df

**Data Formatting**

In [None]:
df.dtypes

In [None]:
# Convert the integer column to floating point so every score column shares one dtype.
fourth_score = df["Fourth Score"]
df["Fourth Score"] = fourth_score.astype(float)

In [None]:
df.dtypes

**Data Normalization** 

Methods of Normalization
1st: Simple feature scaling
2nd: Min max 
3rd: Z-Score 

1st Method

In [None]:
# Add a column of four large-magnitude values to normalize in the cells below.
# Assigning a list requires df to have exactly four rows.
df["large_vals"]=[100023,455656,890987,345678]

In [None]:
# Simple feature scaling: divide every value by the column maximum.
largest = df["large_vals"].max()
df["large_vals"] = df["large_vals"].div(largest)

In [None]:
df

2nd Method

In [None]:
# Min-max scaling: shift by the minimum, then divide by the value range.
# Note: this overwrites the column written by the previous method.
lv_min = df["large_vals"].min()
lv_range = df["large_vals"].max() - lv_min
df["large_vals"] = df["large_vals"].sub(lv_min).div(lv_range)

In [None]:
df

3rd Method

In [None]:
# Z-score: subtract the mean, then divide by the (sample) standard deviation.
# Note: this overwrites the column written by the previous method.
lv_mean = df["large_vals"].mean()
lv_std = df["large_vals"].std()
df["large_vals"] = df["large_vals"].sub(lv_mean).div(lv_std)

In [None]:
df

**Binning : group values into bins**

In [None]:
# Three equal-width bins need four edges spanning the observed score range.
# Series.min()/max() skip NaN, whereas the built-in min()/max() compare
# element by element and give order-dependent results when NaN is present.
third_score = df["Third Score"]
bins = np.linspace(third_score.min(), third_score.max(), 4)
group_names = ['low', 'medium', 'high']
# include_lowest=True so the minimum score falls inside the first bin.
df["score_binned"] = pd.cut(third_score, bins, labels=group_names, include_lowest=True)

In [None]:
df

**Categorical variables**

In [None]:
df['std_cat']=['A','B','C','A']

In [None]:
df1=pd.get_dummies(df['std_cat'])

In [None]:
# Append the indicator columns in df1 to the right of the existing columns.
frames = [df, df1]
df = pd.concat(frames, axis="columns")

In [None]:
df

**EDA**

It is an approach to analyzing data in order to summarise its main characteristics, gain a better understanding of the dataset, uncover relationships between variables, and extract important features. Its main purpose is to find the characteristics that most affect the target variable.

**Descriptive Statistics**

In [None]:
df=pd.read_csv('houseprice.csv')

In [None]:
df

In [None]:
df.describe(include="all")

In [None]:
df.columns

In [None]:
df.info()

In [None]:
# Rebind instead of mutating in place; errors="ignore" keeps the cell
# re-runnable once the column has already been removed.
df = df.drop(columns="ADDRESS", errors="ignore")

In [None]:
# Rebind instead of mutating in place; errors="ignore" keeps the cell
# re-runnable once the column has already been removed.
df = df.drop(columns="POSTED_BY", errors="ignore")

In [None]:
df.head()

**Summarising the categorical data**

In [None]:
valueOfCount=df["BHK_OR_RK"].value_counts().to_frame()

In [None]:
valueOfCount

**BOX PLOT**

Box plots are a great way of visualising numeric data, since they show how the data is distributed. The main features a box plot shows are the median, upper quartile, lower quartile, upper extreme, lower extreme, whiskers, and outliers.

In [None]:
import seaborn as sns

In [None]:
# Price distribution per BHK_OR_RK category.
# NOTE(review): whis=[5, 95] is presumably read as whisker percentiles — confirm
# against the installed seaborn version.
sns.boxplot(
    data=df,
    x="BHK_OR_RK",
    y="TARGET(PRICE_IN_LACS)",
    whis=[5, 95],
)

**Scatter Plot**

They show the relationship between two variables

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scatter of floor area against price, drawn through an explicit Axes object.
x = df["SQUARE_FT"]
y = df["TARGET(PRICE_IN_LACS)"]
fig, scatter_ax = plt.subplots()
scatter_ax.scatter(x, y)
scatter_ax.set_xlabel("Sq ft")
scatter_ax.set_ylabel("Price")
plt.show()


In [None]:
df.describe()

In [None]:
df['RERA'].unique()

In [None]:
df['BHK_NO.'].unique()

In [None]:
df['READY_TO_MOVE'].unique()

In [None]:
df['UNDER_CONSTRUCTION'].unique()

**Heat Map**

In [None]:
# Program to plot a 2-D heat map of random values
# using the seaborn.heatmap() method
import numpy as np
import seaborn as sns
# pyplot (not the legacy pylab namespace) so `plt` keeps meaning the same
# module as in the rest of the notebook.
import matplotlib.pyplot as plt

# Seeded generator so the figure is reproducible on every run.
rng = np.random.default_rng(0)
data_set = rng.random((10, 10))
# `linewidths` is the documented seaborn parameter for the gap between cells.
ax = sns.heatmap(data_set, linewidths=0.5, cmap='coolwarm')

plt.title("2-D Heat Map")
plt.show()

In [None]:
df.columns

In [None]:
# Keep only the price column and the two columns it will be grouped by.
selected = ['TARGET(PRICE_IN_LACS)', 'BHK_NO.', 'READY_TO_MOVE']
couple_columns = df[selected]
couple_columns.head()

In [None]:
# Average the remaining column(s) within every unique
# (BHK_NO., READY_TO_MOVE) combination.
group_keys = ['BHK_NO.', 'READY_TO_MOVE']
phase_1_2 = couple_columns.groupby(group_keys).mean()
print(phase_1_2.shape)
phase_1_2.head(10)

In [None]:
phase_1_2 = phase_1_2.reset_index()
phase_1_2.head()

In [None]:
import numpy as np
import seaborn as sns

# Reshape the grouped means into a grid (in spreadsheet pivot-table terms):
#   index   -> row labels     'BHK_NO.'
#   columns -> column labels  'READY_TO_MOVE'
#   values  -> cell values    'TARGET(PRICE_IN_LACS)'
# DataFrame.pivot takes these as keyword arguments; pandas 2.x rejects
# positional ones with a TypeError.
# Official pivot documentation:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot.html

phase_1_2.pivot(index='BHK_NO.', columns='READY_TO_MOVE', values='TARGET(PRICE_IN_LACS)').head()

In [None]:
# Keyword arguments are required by DataFrame.pivot on pandas 2.x.
pivot_table = phase_1_2.pivot(index='BHK_NO.', columns='READY_TO_MOVE', values='TARGET(PRICE_IN_LACS)')

fig, heat_ax = plt.subplots(figsize=(9, 9))
sns.heatmap(pivot_table, annot=True, fmt=".1f", linewidths=.5, square=True, cmap='Blues_r', ax=heat_ax)
# Label after drawing so the heatmap does not overwrite the text, and match the
# pivot layout: columns (READY_TO_MOVE) run along x, index (BHK_NO.) along y.
heat_ax.set_xlabel('READY_TO_MOVE', size=15)
heat_ax.set_ylabel('BHK_NO.', size=15)
heat_ax.set_title('Price of Homes wrt BHK NO', size=15);

**Correlation**

Measures to what extent different variables are interdependent

In [None]:
# Scatter plus fitted regression line; the y-axis is anchored at zero.
reg_ax = sns.regplot(data=df, x="BHK_NO.", y="TARGET(PRICE_IN_LACS)")
reg_ax.set_ylim(bottom=0)

Therefore, there is only a weak correlation between the two variables.

Various correlation statistical methods

1st Pearson Correlation: it gives us the correlation coefficient and the p-value.

![PEARSON](pearsoncorrelation.png)


In [None]:
from scipy import stats

In [None]:
# Pearson correlation coefficient and its p-value between BHK count and price.
bhk_count = df["BHK_NO."]
price = df["TARGET(PRICE_IN_LACS)"]
pearson_co, p_val = stats.pearsonr(bhk_count, price)

In [None]:
pearson_co

In [None]:
p_val

**Correlation Heatmap**

In [None]:
# Restrict to numeric columns: df may still hold non-numeric columns (e.g. the
# categorical BHK_OR_RK), which make DataFrame.corr() raise on pandas >= 2.0
# unless numeric_only=True is passed. Compute the matrix once and reuse it.
corr_matrix = df.corr(numeric_only=True)
print(corr_matrix)

# plotting correlation heatmap
dataplot = sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)

# displaying heatmap
plt.show()


**Model Development**

**Simple Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lm=LinearRegression()

In [None]:
X=df[["BHK_NO."]]
Y=df["TARGET(PRICE_IN_LACS)"]

In [None]:

lm.fit(X,Y)

In [None]:
results=lm.predict(X)

In [None]:
lm.intercept_

In [None]:
lm.coef_

**Multiple Linear Regression**

In [None]:
# Fresh estimator plus the four predictors and the target for the multiple regression.
predictor_columns = ["BHK_NO.", "SQUARE_FT", "LONGITUDE", "LATITUDE"]
lm = LinearRegression()
X = df[predictor_columns]
Y = df["TARGET(PRICE_IN_LACS)"]

In [None]:
lm.fit(X,Y)

In [None]:
lm.predict(X)

In [None]:
results=lm.predict(X)

In [None]:
lm.coef_

In [None]:
lm.intercept_

**Regression Plot**

In [None]:
sns.regplot(x="BHK_NO.",y="TARGET(PRICE_IN_LACS)",data=df)

**Residual Plot: represents the error between the actual and the predicted values**

In [None]:
# Pass x and y by keyword: in current seaborn the first positional parameter of
# residplot is `data`, so positional x/y vectors are rejected.
sns.residplot(x=df["SQUARE_FT"], y=df["TARGET(PRICE_IN_LACS)"])

Distribution Plot: compares the distribution of the actual values with that of the predicted values

In [None]:
# kdeplot replaces the deprecated distplot(hist=False): both draw only the
# kernel density estimate. The two curves share one Axes for comparison.
axl = sns.kdeplot(x=df["TARGET(PRICE_IN_LACS)"], color="r", label="Actual Values")
sns.kdeplot(x=results, color="b", label="Predicted Value", ax=axl)
# Without a legend the labels above are never shown.
axl.legend();

**Polynomial Regression**

In [None]:
# Use a dedicated name for the 1-D predictor: rebinding X to a Series here
# would break the later pip.fit(X, Y) and lm.fit(X, Y) cells, which need a
# 2-D feature matrix.
x_bhk = df["BHK_NO."]
# Cubic least-squares fit of price on BHK count; f holds the coefficients,
# highest power first.
f = np.polyfit(x_bhk, Y, 3)

In [None]:
p=np.poly1d(f)

In [None]:
print(p)

Polynomial Regression with more than one dimension

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
pr=PolynomialFeatures(degree=2,include_bias=False)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scale=StandardScaler()

In [None]:
scale.fit(df[["BHK_NO.","SQUARE_FT"]])

In [None]:
x_scale=scale.transform(df[["BHK_NO.","SQUARE_FT"]])

In [None]:
x_scale

**Pipelines**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [None]:
# Pipeline steps as (name, estimator) pairs, applied in this order:
# standardise -> expand to degree-2 polynomial terms -> linear regression.
Input = [
    ("scale", StandardScaler()),
    ("polynomial", PolynomialFeatures(degree=2)),
    ("model", LinearRegression()),
]

In [None]:
pip=Pipeline(Input)

In [None]:
pip.fit(X,Y)

In [None]:
results=pip.predict(X)

In [None]:
results

**Model Evaluation**

**MSE**

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mean_squared_error(df["TARGET(PRICE_IN_LACS)"], results)

**R-Squared**

In [None]:
lm.fit(X,Y)

In [None]:
lm.score(X,Y)

**Train_test_split() Method**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df.columns

In [None]:
# Predictor matrix (eight columns) and target vector for the train/test split.
feature_columns = [
    'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'SQUARE_FT',
    'READY_TO_MOVE', 'RESALE', 'LONGITUDE', 'LATITUDE',
]
X = df[feature_columns]
Y = df["TARGET(PRICE_IN_LACS)"]

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

**Cross Validation**

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Inputs for cross-validation: an unfitted linear model, the eight predictors and the target.
lr = LinearRegression()
cv_columns = ['UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'SQUARE_FT',
              'READY_TO_MOVE', 'RESALE', 'LONGITUDE', 'LATITUDE']
X = df[cv_columns]
Y = df["TARGET(PRICE_IN_LACS)"]

In [None]:
scores=cross_val_score(lr,X,Y,cv=2)

In [None]:
# Highest score among the cross-validation folds.
scores.max()

Underfitting Where a model is too simple to fit the data

Overfitting where a model is too flexible and fits the noise rather than a function

**Model Selection: which polynomial degree gives the best fit?**

In [None]:
X_train

In [None]:
X_test.shape

In [None]:
# x_test_pr is otherwise first assigned inside the degree-selection loop further
# down, so this cell raised NameError on a fresh kernel. Build it here (degree 2
# as an example) before inspecting its shape.
x_test_pr = PolynomialFeatures(degree=2).fit_transform(X_test[["SQUARE_FT"]])
x_test_pr.shape

In [None]:
# x_train_pr is otherwise first assigned inside the degree-selection loop further
# down, so this cell raised NameError on a fresh kernel. Build it here (degree 2
# as an example) before inspecting its shape.
x_train_pr = PolynomialFeatures(degree=2).fit_transform(X_train[["SQUARE_FT"]])
x_train_pr.shape

In [None]:
y_test.shape

In [None]:
y_train.shape

In [None]:
# Test-set R^2 of a polynomial regression on SQUARE_FT for each candidate degree.
resq_test = []
order = [1, 2, 3, 4]
for degree in order:
    pr = PolynomialFeatures(degree=degree)
    # Fit the transformer on the training split only ...
    x_train_pr = pr.fit_transform(X_train[["SQUARE_FT"]])
    # ... and merely apply it to the test split, so nothing is learned from test data.
    x_test_pr = pr.transform(X_test[["SQUARE_FT"]])
    lr.fit(x_train_pr, y_train)
    resq_test.append(lr.score(x_test_pr, y_test))

In [None]:
resq_test

**Ridge Regression**

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression with regularisation strength alpha=10, fitted on the full X / Y.
# fit() returns the estimator itself, so it can be bound and displayed directly.
ridge_model = Ridge(alpha=10).fit(X, Y)
ridge_model

In [None]:
results=ridge_model.predict(X)

In [None]:
results

In [None]:

ridge_model.score(X,Y)