In [None]:
# Importing libraries

# For numerical computation on nd-arrays
import numpy as np

# For data analysis and manipulations with dataset
import pandas as pd

# Data visualization library
import matplotlib.pyplot as plt

# Data visualization library built upon matplotlib
import seaborn as sns
 
# To ignore warnings related to versions mismatch or updates
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Using this, the output of plotting commands is displayed inline within this notebook
%matplotlib inline

# This class contains methods to perform analysis and find insights.
# Some generalizations have been made so that it works across all the datasets.
class Analysis:
    """Exploratory-analysis helper around a single tabular CSV dataset.

    The class attributes below are defaults shared by every dataset. All
    methods read them through ``self``, so they can be overridden either on
    the class (``Analysis.target = ...``) or on one instance
    (``City.target = ...``). Methods that grow ``strong_corr`` or
    ``remove_outliers`` rebind the list on the instance and never mutate the
    class-level default, so one dataset cannot leak state into another.

    Instance attributes:
        df: the loaded DataFrame (duplicates removed, later outliers removed).
        df_num: the float64/int64 columns of ``df``; refreshed whenever ``df``
            is filtered by this class.
    """

    # "Price" is the target variable for all the datasets, the one to be predicted
    target = "Price"

    # Attributes which are not categorical; everything else is treated as categorical
    not_categorical = ["Area", "Price", "Location"]

    # These attributes have the strongest correlation with the target in almost all datasets
    strong_corr = ["Area", "No. of Bedrooms"]

    # On these numeric attributes, outlier removal must be done to gain better insights
    remove_outliers = ["Area", "Price"]

    def __init__(self, name):
        """Load the CSV at ``name``, drop duplicate rows and report missing values.

        Args:
            name: anything accepted by ``pandas.read_csv`` (path, URL, buffer).
        """
        self.df = pd.read_csv(name)
        print(f"Shape - {self.df.shape}")

        # Remove duplicate rows
        print("Removed duplicate rows")
        self.df.drop_duplicates(inplace=True)
        print(f"Shape after removing duplicate rows - {self.df.shape}")

        # Build the numeric view only after de-duplication so it matches self.df
        self._refresh_numeric()

        # Check for missing values; the "no missing values" message is printed
        # only when every column is complete.
        missing = self.df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            print("No missing values in dataset!")
        else:
            for column, count in missing.items():
                print(f"There are {count} missing values in column - {column}")

    def _refresh_numeric(self):
        """Recompute ``df_num`` so it mirrors the current rows of ``df``."""
        self.df_num = self.df.select_dtypes(include=['float64', 'int64'])

    def _categorical_frame(self):
        """Return ``df`` without the non-categorical columns and the target.

        Columns listed in ``not_categorical`` that the dataset does not have
        are skipped instead of raising ``KeyError``.
        """
        excluded = set(self.not_categorical) | {self.target}
        return self.df.drop(columns=[col for col in self.df.columns if col in excluded])

    @staticmethod
    def _subplot_grid(n_plots, figsize=None, cols=3):
        """Create a grid with enough axes for ``n_plots`` (must be >= 1).

        Returns the figure and the first ``n_plots`` axes; surplus axes in the
        last row are hidden.
        """
        rows = -(-n_plots // cols)  # ceiling division, so no plot is dropped
        if figsize is None:
            figsize = (cols * 4, rows * 3)
        fig, _ = plt.subplots(rows, cols, figsize=figsize)
        axes = fig.axes
        for unused in axes[n_plots:]:
            unused.set_visible(False)
        return fig, axes[:n_plots]

    def PlotTargetVar(self):
        """Print the skewness of the target and plot its distribution."""
        print("Distribution of target variable\n")
        print("Skewness: %f" % self.df[self.target].skew())
        plt.figure(figsize=(10, 10))
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # switch to histplot(..., kde=True) once the minimum seaborn version allows it.
        sns.distplot(self.df[self.target], bins=100, color='r')

    def PlotFeatures(self):
        """Plot one histogram per numeric column."""
        print("Distribution of data on each series in the df, resulting in one histogram per column\n")
        self.df_num.hist(figsize=(20, 20), bins=50, xlabelsize=8, ylabelsize=8)

    def CorrTarget(self):
        """Print each numeric attribute's correlation with the target.

        Attributes whose absolute correlation exceeds 0.3 are added to this
        instance's ``strong_corr`` list (used by ``StrongCorrRegplot``).
        """
        print("Correlation of every attribute with {}".format(self.target))
        # Drop the target by label: it is not necessarily the first column.
        target_corr = self.df_num.corr()[self.target].drop(self.target)
        print(target_corr.sort_values(ascending=False))

        # Adding strongly correlated attributes for plotting regplots
        strong = target_corr[target_corr.abs() > 0.3].sort_values(ascending=False)
        known = list(self.strong_corr)
        for attr in strong.index:
            if attr not in known:
                known.append(attr)
        self.strong_corr = known

    def CorrFeatures(self):
        """Draw an annotated heatmap of feature-to-feature correlations."""
        print("Analyzing feature to feature relationships\n")
        plt.figure(figsize=(24, 20))
        sns.heatmap(self.df_num.corr(), annot=True, annot_kws={"size": 8})

    def CategoricalFeatures(self):
        """Print value counts and draw a count plot for every categorical column."""
        print("Lets look at categorical features distribution\n")
        df_cat = self._categorical_frame()
        for col in df_cat.columns:
            print(df_cat[col].value_counts())
            print('-' * 100)
        if df_cat.columns.empty:
            return
        fig, axes = self._subplot_grid(len(df_cat.columns), figsize=(6, 15))
        for col, axis in zip(df_cat.columns, axes):
            sns.countplot(x=col, data=df_cat, ax=axis)
        fig.tight_layout()

    def TargetAnalysisLoc(self):
        """Bar-plot the 5 locations with the highest and the 5 with the lowest target totals."""
        print("Top 5 locations with highest house prices and lowest house prices\n")
        # NOTE(review): totals favour locations with many listings; a mean or
        # median per location may be what is intended - confirm.
        totals = self.df.groupby("Location")[self.target].sum()
        highest = totals.sort_values(ascending=False).head(5)
        lowest = totals.sort_values(ascending=True).head(5)
        # Series.append was removed in pandas 2.0; concat works on every version
        combined = pd.concat([highest, lowest])
        combined.plot.bar(figsize=(30, 15))
        plt.tick_params(axis='both', labelsize=18)
        plt.xticks(rotation=45)

    def StrongCorrRegplot(self):
        """Draw a regression plot against the target for each strongly correlated attribute.

        Attributes in ``strong_corr`` that are absent from the dataset are skipped.
        """
        print("Plotting data and linear regression model fit for strongly correlated values with", self.target)
        features = [attr for attr in self.strong_corr
                    if attr in self.df.columns and attr != self.target]
        if not features:
            return
        fig, axes = self._subplot_grid(len(features))
        for attr, axis in zip(features, axes):
            sns.regplot(x=attr, y=self.target, data=self.df, ax=axis,
                        scatter_kws={'s': 6, 'alpha': 0.8, 'color': 'gray'},
                        line_kws={'lw': 2, 'color': 'black', 'linestyle': 'dashed'})
        fig.tight_layout()

    def OutlierAnalysis(self, columns=None):
        """Remove rows that are IQR outliers in the configured columns.

        Args:
            columns: optional extra column names to process in addition to
                ``remove_outliers``. They are remembered on this instance for
                later calls; duplicates are ignored.

        Columns are filtered one after another, so the quartiles of a later
        column are computed on the rows that survived the earlier ones. Rows
        with a missing value in a column are kept.
        """
        selected = list(self.remove_outliers)
        for col in (columns or []):
            if col not in selected:
                selected.append(col)
        self.remove_outliers = selected

        # Outlier removal using the IQR score (1.5 * IQR fences)
        for col in selected:
            q1 = self.df[col].quantile(0.25)
            q3 = self.df[col].quantile(0.75)
            iqr = q3 - q1
            is_outlier = (self.df[col] < q1 - 1.5 * iqr) | (self.df[col] > q3 + 1.5 * iqr)
            self.df = self.df[~is_outlier]

        # Keep the numeric view in sync with the filtered rows
        self._refresh_numeric()

        print("Removed outliers from " + "".join(f"{col} " for col in selected))
        print(f"Shape after removing outliers - {self.df.shape}")

    def CategorialToTarget(self):
        """Draw a box plot of the target for every categorical column."""
        print(f"Relation to {self.target} for all categorical features")
        df_cat = self._categorical_frame()
        if df_cat.columns.empty:
            return
        fig, axes = self._subplot_grid(len(df_cat.columns))
        for col, axis in zip(df_cat.columns, axes):
            sns.boxplot(x=col, y=self.target, data=self.df, ax=axis)
        fig.tight_layout()

    # Correctly spelled alias; the original name is kept for existing callers
    CategoricalToTarget = CategorialToTarget

In [None]:
# Load the dataset into an Analysis object (prints shapes and missing values)
# NOTE(review): "?" looks like a placeholder for the dataset file name - confirm
City = Analysis("Datasets/?.csv")

# Top 5 rows
City.df.head()

In [None]:
# Information about the dataframe and the datatype of each attribute
City.df.info(verbose = True)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for Price and Area
City.df[["Price", "Area"]].describe()

In [None]:
# To gain better insights, let's remove outliers from the attributes Price and Area
# You can also pass a list of additional attributes for which outlier removal has to be done
# By default, Price and Area have already been added to the list
City.OutlierAnalysis()

In [None]:
# Plots the distribution of the target variable
# To analyse a different target, set Analysis.target = target_var beforehand; by default it's "Price"
City.PlotTargetVar()

In [None]:
# Now let's plot the distribution of the data for each numeric feature for better analysis
City.PlotFeatures()

In [None]:
# Now, let's find columns with strong correlation to the target
City.CorrTarget()

In [None]:
# Analyse the features strongly related to the target variable
City.StrongCorrRegplot()

In [None]:
# Analysis of feature to feature relationships using a heatmap
City.CorrFeatures()

In [None]:
# Let's take a look at each categorical feature
City.CategoricalFeatures()

In [None]:
# Also analyze their relationship to the target variable
City.CategorialToTarget()

In [None]:
# Let's analyze how house price varies according to location
City.TargetAnalysisLoc()