## Import Libraries

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingRegressor, VotingClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px 
import plotly.graph_objects as go
import scipy.stats as stats

import mplcyberpunk

import warnings
# NOTE: filterwarnings() inserts each new rule at the front of the filter
# list, so the later "ignore" rule takes precedence over "always" and every
# warning (including library deprecation warnings) is silenced from here on.
warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

In [2]:
# Global plot appearance: seaborn dark grid with enlarged fonts, then the
# "cyberpunk" matplotlib style (presumably registered by the `mplcyberpunk`
# import above — verify if that import is ever removed).
sns.set(style='darkgrid', font_scale=1.4)
plt.style.use("cyberpunk")

# 1.0 Basic Data Cleaning and Feature Exploration

In [3]:
# Load the housing CSV into a DataFrame and preview the first rows
data = "melb_data.csv"
df = pd.read_csv(filepath_or_buffer=data)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
# Summary information of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
Suburb           13580 non-null object
Address          13580 non-null object
Rooms            13580 non-null int64
Type             13580 non-null object
Price            13580 non-null float64
Method           13580 non-null object
SellerG          13580 non-null object
Date             13580 non-null object
Distance         13580 non-null float64
Postcode         13580 non-null float64
Bedroom2         13580 non-null float64
Bathroom         13580 non-null float64
Car              13518 non-null float64
Landsize         13580 non-null float64
BuildingArea     7130 non-null float64
YearBuilt        8205 non-null float64
CouncilArea      12211 non-null object
Lattitude        13580 non-null float64
Longtitude       13580 non-null float64
Regionname       13580 non-null object
Propertycount    13580 non-null float64
dtypes: float64(12), int64(1), object(8)
memory usage: 2.2+ MB


### 1.1 Descriptive Summary
The `df.describe()` function provides summary statistics that can be used for various operations and analysis. Here are some common operations that can be performed using the information obtained from `df.describe()`:

1. **Data Cleaning:** By examining the count of each column, you can identify missing values. If a column has a count less than the total number of rows in the DataFrame, you can decide how to handle those missing values, such as imputing them with appropriate values or removing rows or columns with missing data.

2. **Data Exploration:** Descriptive statistics like mean, standard deviation, minimum, and maximum can give you insights into the distribution and variability of the data. You can identify the range of values, detect potential outliers, and understand the spread of data in each column. This information can help you explore and understand the characteristics of your dataset.

3. **Data Validation:** Summary statistics can be used to validate the data and check for any anomalies or inconsistencies. For example, if you have domain knowledge or expectations about the data, you can compare the minimum and maximum values to ensure they fall within the expected range. If any values seem unreasonable, it may indicate errors in the data.

4. **Feature Engineering:** Summary statistics can guide feature engineering decisions. For instance, if you observe a large difference between the mean and maximum values in a column, it suggests the presence of outliers. You can then consider applying transformations or creating new features to handle these outliers or capture the skewed distribution of the data.

5. **Data Visualization:** Summary statistics provide a high-level understanding of the data, and they can be used to create visualizations. Box plots, histograms, or bar charts based on the quartiles, mean, and standard deviation can help visualize the distribution and variability of the data. These visualizations can aid in communicating insights and patterns effectively.

6. **Modeling Decisions:** Descriptive statistics provide important insights for modeling decisions. For instance, understanding the range and distribution of the target variable can help determine the appropriate modeling approach, such as classification or regression. Additionally, identifying potential outliers or highly variable features can guide decisions on data preprocessing techniques or feature selection methods.

Overall, the summary statistics obtained from `df.describe()` serve as a starting point for various data operations, including data cleaning, exploration, validation, feature engineering, visualization, and modeling decisions. They provide valuable information for understanding and manipulating the data to achieve the desired analytical goals.

In [5]:
# Subset of the frame holding only the numeric (int/float) columns
numeric_feat = df.select_dtypes(include="number")
numeric_feat.head()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,1480000.0,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.7996,144.9984,4019.0
1,2,1035000.0,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0
2,3,1465000.0,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,144.9944,4019.0
3,3,850000.0,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.7969,144.9969,4019.0
4,4,1600000.0,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,144.9941,4019.0


In [None]:
# Summary statistics for Numeric features
df.describe()

In [None]:
# Subset of the frame holding only the object-dtype (categorical/text) columns
cat_feat = df.select_dtypes(include="object")

In [None]:
# Summary statistics for Categorical features
df.describe(include=['O'])

**Key**

Interpreting the information provided by `df.describe()` involves understanding the statistics generated for each column of the DataFrame. Here's a guide on how to interpret the information:

1. **Count:** The count tells you the number of non-missing values in each column. If the count is less than the total number of rows in the DataFrame, it indicates the presence of missing values.

2. **Mean:** The mean represents the average value of the data in each column. It gives you an idea of the central tendency of the data. For example, if the mean of a column is 50, it suggests that the average value in that column is around 50.

3. **Standard Deviation:** The standard deviation measures the spread or dispersion of the data around the mean. A higher standard deviation indicates that the data points are more spread out, while a lower standard deviation suggests that the data points are closer to the mean. It helps understand the variability within the column.

4. **Minimum and Maximum:** The minimum and maximum values provide the range of values present in each column. The minimum value is the smallest value, while the maximum value is the largest value. They help identify the lower and upper bounds of the data.

5. **Quartiles (25th, 50th, 75th percentiles):** Quartiles divide the data into four equal parts, providing insights into the distribution of the data. The 25th percentile (Q1) represents the value below which 25% of the data falls, the 50th percentile (Q2) is the median or middle value, and the 75th percentile (Q3) indicates the value below which 75% of the data falls. Quartiles help understand the spread of data and identify potential outliers.

By examining these statistics, you can gain insights into the central tendency, variability, range, and distribution of the data in each column. It helps you understand the characteristics of the dataset and make informed decisions regarding data cleaning, feature engineering, and modeling approaches.

**Research:**
What insights can be gathered here?

### 1.2 Null Values

In [None]:
# Total amount of null values on column/feature
df.isnull().sum()

In [None]:
# Share of missing values per column, expressed as a percentage
null_counts = df.isnull().sum()
null_counts / len(df) * 100

**Research:**

The Car, BuildingArea, YearBuilt and CouncilArea have null values.

### 1.3 Check for Duplicates

In [None]:
df.duplicated().sum()

---

# 2.0 Exploratory Data Analysis (EDA)

Exploratory Data Analysis (EDA) graphs are visualizations used to gain insights into the underlying patterns, distributions, relationships, and anomalies within a dataset. They play a crucial role in the data analysis process and provide several key benefits:

1. **Data Understanding:** EDA graphs help in understanding the structure and characteristics of the data. They allow you to observe the data's shape, spread, and central tendency, providing a visual representation of the data distribution. This understanding is essential for making informed decisions and formulating appropriate analysis strategies.

2. **Pattern Identification:** EDA graphs reveal patterns and trends within the data. By visualizing the data, you can identify recurring patterns, cycles, or seasonality. This information can help in detecting dependencies, correlations, or underlying relationships that might not be apparent from raw data.

3. **Outlier Detection:** Outliers are data points that significantly deviate from the rest of the data. EDA graphs, such as box plots or scatter plots, can highlight these anomalies. Outliers may indicate errors, data quality issues, or rare events that need to be investigated further.

4. **Data Distribution:** Histograms and density plots provide insights into the distribution of a variable. They help identify whether the data follows a normal distribution or if it is skewed or multimodal. Understanding the data distribution can guide decisions related to modeling assumptions and appropriate statistical techniques.

5. **Feature Selection:** EDA graphs can assist in feature selection by visualizing the relationships between variables. Scatter plots and correlation matrices can reveal correlations or dependencies among variables. This information aids in identifying important features for modeling or feature engineering.

6. **Data Imbalance:** In classification problems, EDA graphs can expose class imbalances, where the number of observations in different classes is significantly different. Imbalanced data can lead to biased models, and EDA graphs can help identify the need for data resampling techniques or alternative modeling approaches.

7. **Visualization of Time Series Data:** Time series plots can show how a variable changes over time. EDA graphs can reveal trends, seasonal patterns, or unusual behavior in time-dependent data. These insights are valuable for forecasting, anomaly detection, and decision-making in various domains.

8. **Communication and Reporting:** EDA graphs are effective tools for communicating findings and insights to stakeholders or team members. Visual representations of data are often easier to interpret and understand than raw numbers or statistical measures. EDA graphs enable clear and concise communication of complex data patterns.

Overall, EDA graphs provide a visual framework for exploring and understanding the data. They complement statistical analyses, uncover hidden patterns, identify outliers, guide modeling decisions, and facilitate effective communication of insights. EDA graphs are an integral part of the exploratory data analysis process and contribute significantly to the overall data analysis workflow.

When analyzing a dataset with the given columns, you can ask various questions to gain insights and understand the properties in the dataset. Here are some potential questions you can explore:

1. **What is the distribution of property prices?**
   - Analyze the `Price` column to understand the range, mean, median, and spread of property prices.
   - Identify any outliers or unusual patterns in the price distribution.

2. **How does the number of rooms (`Rooms`) relate to property prices (`Price`)?**
   - Explore the relationship between the number of rooms and the property prices.
   - Determine if there is a positive correlation between the two variables.

3. **What are the different types of properties (`Type`) in the dataset?**
   - Identify the unique property types and their respective frequencies.
   - Analyze how the property type affects the prices and other attributes.

4. **Which suburbs (`Suburb`) have the highest property prices?**
   - Determine the suburbs with the most expensive properties.
   - Visualize the distribution of property prices across different suburbs.

5. **Are there any temporal trends in property prices (`Price`) over time (`Date`)?**
   - Analyze how property prices change over different dates or time periods.
   - Identify any seasonal or long-term trends in the data.

6. **What is the average land size (`Landsize`) and building area (`BuildingArea`) for different property types?**
   - Compare the average land size and building area among different property types.
   - Identify any variations or trends in the sizes of properties.

7. **Is there a relationship between property attributes, such as the number of bedrooms (`Bedroom2`), bathrooms (`Bathroom`), and car spaces (`Car`)?**
   - Investigate how the number of bedrooms, bathrooms, and car spaces are related.
   - Determine if certain combinations of these attributes are more common.

8. **Which council areas (`CouncilArea`) have the highest number of properties (`Propertycount`)?**
   - Identify the council areas with the highest property counts.
   - Analyze if property counts correlate with other factors such as location or property prices.

9. **Are there any spatial patterns in property locations (`Lattitude`, `Longtitude`) and prices (`Price`)?**
   - Visualize the property locations on a map using latitude and longitude coordinates.
   - Explore if there are any spatial patterns or clusters in property prices.

10. **How does the distance to a particular location (`Distance`) affect property prices (`Price`)?**
    - Analyze the relationship between the distance to a specific location and property prices.
    - Determine if there is a correlation or any notable patterns.

These are just a few examples of the questions you can ask during your analysis. Depending on your specific goals and interests, you can further refine these questions or explore additional aspects of the dataset.

### What is the distribution of property prices?

1. **Histogram:**
   - Identifies the distribution of a variable, including its shape (symmetric, skewed, bimodal, etc.).
   - Reveals the central tendency and spread of the data.
   - Highlights the presence of outliers or unusual patterns.

In [None]:
#for i in df.columns:
    #sns.histplot(df[i],kde=True,color="y")
    #plt.title("Distribution of "+i)
    #plt.show()

In [None]:
# Histogram of sale prices with a KDE overlay on a wide canvas
fig, ax = plt.subplots(figsize=(20, 7))
sns.histplot(df['Price'], kde=True, color="y", ax=ax)
ax.set_title("Distribution of Price")
plt.show()

**Research:**

The Price is **Right skewed**.


---

### Check for Outlier

2. **Box Plot:**
   - Identifies the distribution of a variable, including the median, quartiles, and outliers.
   - Highlights potential outliers in the data.
   - Helps in comparing the distributions of different variables or groups.

In [None]:
# Draw one interactive Plotly box plot per numeric column to eyeball outliers.
# The figures are rendered by Plotly, so no matplotlib figure is created here:
# the previous `plt.figure(...)` call only produced an empty, never-closed
# matplotlib figure on every iteration.
for col in numeric_feat.columns:
    fig = px.box(df, y=[col])
    fig.show()

In [None]:
# Numeric columns to inspect for outliers; this list is iterated later by the
# IQR fence computation in the outlier section.
outliers=['Rooms', 'Price', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']

**Insight:**

All the numeric features:- Rooms, Price, Distance, Postcode, Bedroom2, Bathroom, Car, Landsize, BuildingArea, YearBuilt, Lattitude, Longtitude and Propertycount, have outliers.

---

### How does the number of rooms (Rooms) relate to property prices (Price)?

**Scatter Plot:**
   - Identifies the relationship and correlation between two variables.
   - Reveals patterns, trends, or clusters in the data.
   - Highlights potential outliers or anomalies.

In [None]:
# Scatter of room count against sale price
fig, ax = plt.subplots(figsize=(18, 15))
sns.scatterplot(data=df, x="Rooms", y="Price", ax=ax)
ax.set_title("Rooms vs Price Distribution")

### Is there a relationship between property attributes, such as the number of bedrooms (Bedroom2), bathrooms (Bathroom), and car spaces (Car)?

In [None]:
# Restrict the frame to the three property attributes under study
attributes = ['Bedroom2', 'Bathroom', 'Car']
property_data = df.loc[:, attributes]

# Pairwise scatter plots (with per-attribute histograms on the diagonal)
sns.pairplot(property_data)
plt.show()

# Pairwise correlation of the same attributes, printed as a table
correlation_matrix = property_data.corr()
print(correlation_matrix)


### Are there any spatial patterns in property locations (Lattitude, Longtitude) and prices (Price)?

In [None]:
# Scatter plot of property locations and prices.
# 'Lattitude' / 'Longtitude' are the dataset's own (misspelled) column names;
# only the axis titles use the corrected spelling.
scatter_fig = px.scatter(df, x='Longtitude', y='Lattitude', color='Price')
scatter_fig.update_layout(title='Property Locations and Prices',
                          xaxis_title='Longitude', yaxis_title='Latitude')
scatter_fig.show()

# Spatial heatmap of property prices: density weighted by Price (z) on a map
# tile layer. The fixed center (-37.8136, 144.9631) is presumably Melbourne's
# CBD — confirm before reusing this cell with another dataset.
heatmap_fig = px.density_mapbox(df, lat='Lattitude', lon='Longtitude', z='Price',
                                radius=10, center=dict(lat=-37.8136, lon=144.9631),
                                zoom=10, mapbox_style='carto-positron')
heatmap_fig.update_layout(title='Spatial Heatmap of Property Prices')
heatmap_fig.show()

### How does the distance to a particular location (Distance) affect property prices (Price)?

In [None]:
# Scatter of Distance against sale price
fig, ax = plt.subplots(figsize=(20, 8))
ax.scatter(df['Distance'], df['Price'])
ax.set_xlabel('Distance')
ax.set_ylabel('Price')
ax.set_title('Distance vs. Property Prices')
plt.show()

# Pearson correlation between Distance and Price
correlation_coefficient = df['Distance'].corr(df['Price'])
print('Correlation Coefficient:', correlation_coefficient)


### Are there any temporal trends in property prices (Price) over time (Date)?

**Line Plot (Time Series):**

- Identifies the trends and patterns over time.
- Reveals seasonality or cyclic patterns in the data.
- Helps in forecasting or detecting anomalies in time-dependent data.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Convert 'Date' to datetime. The raw strings are day-first (e.g. "4/02/2016"
# is 4 February 2016), so dayfirst=True is required: pandas' default
# month-first parsing would swap day and month for every date whose day is
# <= 12 and put those sales in the wrong month of the aggregation below.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Average sale price per calendar month
monthly_prices = df.groupby(df['Date'].dt.to_period('M'))['Price'].mean()

# Plot the monthly average as a time series
plt.figure(figsize=(20, 8))
monthly_prices.plot()
plt.xlabel('Date (Month)')
plt.ylabel('Average Price')
plt.title('Temporal Trends in Property Prices')
plt.show()


---

## Distribution of Categorical Features

### Which suburbs (Suburb) have the highest property prices?

**Bar Chart:**
   - Identifies the comparison or distribution of categorical variables.
   - Helps in understanding the frequency or count of each category.
   - Highlights differences or similarities between categories.

In [None]:
import matplotlib.pyplot as plt

# Number of most expensive suburbs to display; adjust as desired
top_n = 40

# Average sale price per suburb, keeping only the top_n highest averages
suburb_prices = df.groupby('Suburb')['Price'].mean().nlargest(top_n)

# Bar chart of the selected suburbs, most expensive first
plt.figure(figsize=(20, 8))
suburb_prices.plot(kind='bar')
plt.xlabel('Suburb')
plt.ylabel('Average Price')
plt.title('Suburbs with Highest Property Prices')
plt.xticks(rotation=90)
plt.show()

### Which regions (Regionname) have the highest number of listed properties?

In [None]:
# Count rows per Regionname (rows with a non-null Propertycount), largest
# first. This counts listings in the dataset; it does not sum the
# Propertycount values themselves.
property_counts = (
    df.groupby('Regionname')['Propertycount']
    .count()
    .sort_values(ascending=False)
)

# Bar chart of the per-region counts
fig, ax = plt.subplots(figsize=(20, 8))
property_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Regionname')
ax.set_ylabel('Property Count')
ax.set_title('Number of Properties by Regionname')
plt.xticks(rotation=90)
plt.show()

### Which council areas (CouncilArea) have the highest number of properties (Propertycount)?

In [None]:
# Count rows per CouncilArea (rows with a non-null Propertycount), largest
# first. Rows whose CouncilArea is missing are excluded by groupby.
property_counts = (
    df.groupby('CouncilArea')['Propertycount']
    .count()
    .sort_values(ascending=False)
)

# Bar chart of the per-council counts
fig, ax = plt.subplots(figsize=(20, 8))
property_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Council Area')
ax.set_ylabel('Property Count')
ax.set_title('Number of Properties by Council Area')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Show which council areas fall inside which region (one marker per pairing)
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(data=df, x='CouncilArea', y='Regionname', ax=ax)
plt.xticks(rotation=90)

### What are the different types of properties (Type) in the dataset?

In [None]:
plt.figure(figsize=(20, 8))

# Bar chart of listing counts per property Type.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
ax_type = sns.countplot(x='Type', data=df)

# Write each bar's count just above it
for p in ax_type.patches:
    ax_type.annotate('{:.1f}'.format(p.get_height()), (p.get_x()+0.25, p.get_height()+0.01))

plt.show()

In [None]:
plt.figure(figsize=(20, 8))

# Bar chart of listing counts per sale Method.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
ax_type = sns.countplot(x='Method', data=df)

# Write each bar's count just above it
for p in ax_type.patches:
    ax_type.annotate('{:.1f}'.format(p.get_height()), (p.get_x()+0.25, p.get_height()+0.01))

plt.show()

In [None]:
# Grouped bar chart: listing counts per property Type, split by sale Method
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(data=df, x='Type', hue='Method', ax=ax)
ax.set_title("Method vs Type Distribution")

### What is the average land size (Landsize) and building area (BuildingArea) for different property types?

In [None]:
# Mean land size and building area per property Type.
# Several columns must be selected with a list (double brackets); the bare
# tuple form `['Landsize', 'BuildingArea']` was deprecated and raises in
# pandas >= 2.0.
average_sizes = df.groupby('Type')[['Landsize', 'BuildingArea']].mean()

# Side-by-side bars for the two averages
average_sizes.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Property Type')
plt.ylabel('Average Size')
plt.title('Average Land Size and Building Area by Property Type')
plt.xticks(rotation=0)
plt.legend(loc='upper left')
plt.show()

### Correlation Matrix

6. **Heatmap:**
   - Identifies patterns and relationships between multiple variables.
   - Reveals correlations or dependencies between variables.
   - Highlights clusters or groups of variables with similar characteristics.

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds text columns (and a datetime 'Date' once the temporal
# cell has run); pandas >= 2.0 no longer drops those silently in corr(), so the
# numeric columns are selected explicitly first.
corr = df.select_dtypes(include=[np.number]).corr()
fig = px.imshow(corr, text_auto=True, aspect="auto")
fig.show()

In [None]:
# Keep an independent snapshot of the data before cleaning. A plain
# `df_original = df` only creates a second name for the same object, so the
# in-place edits made later (fillna assignments, drop(inplace=True)) would
# alter the "original" as well.
df_original = df.copy()
df_original.head()

# 3.0 Data Cleaning

### 3.1 Dealing with Missing Values

In [None]:
# Fill gaps in the numeric columns with each column's own median
for col_name in ("Car", "BuildingArea", "YearBuilt"):
    df[col_name] = df[col_name].fillna(df[col_name].median())

In [None]:
# Fill missing council areas with the most frequent council area
most_common_council = df['CouncilArea'].value_counts().index[0]
df['CouncilArea'] = df['CouncilArea'].fillna(most_common_council)

### 3.2 Outliers

In [None]:
# Report the Tukey (1.5 * IQR) fences for every column listed in `outliers`.
# Series.quantile skips missing values, so the fences stay defined even if
# this cell is run before the imputation step; only Q1 and Q3 are needed.
for col in outliers:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    higher_fence = q3 + 1.5 * iqr
    print(f"In {col} Column any values beyond the Range {lower_fence} and {higher_fence} are outliers")

**Insight:**

The range of outliers is huge, so filtering them out would not be a suitable option.

# Feature Engineering

df = pd.get_dummies(df,columns=['Suburb', 'Type', 'Method', 'CouncilArea', 'Regionname'])

# Pre Processing

### Convert Categorical Columns to Numericals

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoder=LabelEncoder()

In [None]:
# Replace each categorical column with integer codes.
# LabelEncoder expects a 1-D array, so the Series `df[col]` is passed rather
# than the one-column DataFrame `df[[col]]` (which triggers a
# DataConversionWarning). The shared encoder is re-fit per column, so after
# the loop it only remembers the classes of the last column.
# NOTE(review): if the `pd.get_dummies(...)` line under "Feature Engineering"
# has been executed, these columns no longer exist and this cell raises
# KeyError — use one encoding or the other, not both.
for col in ["Suburb", "Type", "Method", "CouncilArea", "Regionname"]:
    df[col] = encoder.fit_transform(df[col])

In [None]:
df.head()

In [None]:
# Remove the columns that are not used as model features (in place)
unused_columns = ["Address", "SellerG", "Date"]
df.drop(columns=unused_columns, inplace=True)

### Divide Data into Independent and Dependent Features

In [None]:
# Target is the sale price; every remaining column is a feature
y = df["Price"]
x = df.drop(columns="Price")

In [None]:
x.head()

In [None]:
y

### Scaling the Independent Features

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature and rebuild a DataFrame with the original column
# names (fit_transform returns a bare array).
# NOTE(review): the scaler is fit on the full feature set here, before the
# train/test split performed further down, so test rows influence the scaling
# statistics. Consider fitting on the training split only.
scaler = StandardScaler()
x_new = scaler.fit(x).transform(x)
x = pd.DataFrame(data=x_new, columns=x.columns)
x.head()

### Divide data into Training and Testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

In [None]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

# Build a Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares regression: create the estimator and train it on the
# training split (fit() returns the estimator itself)
model = LinearRegression().fit(x_train, y_train)

# Predicted prices for the held-out rows; displayed as the cell output
y_pred = model.predict(x_test)
y_pred

### Check the cost functions with respect to predictions

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [None]:
# Error metrics of the predictions on the test split; RMSE is the square root
# of the MSE computed once
mse = mean_squared_error(y_test, y_pred)
d = {
    "mse": mse,
    "mae": mean_absolute_error(y_test, y_pred),
    "rmse": np.sqrt(mse),
}

In [None]:
d

### Check r^2 and Adjusted r^2 Scores

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Coefficient of determination on the test split, stored for the summary dict
r2 = r2_score(y_test, y_pred)
d1 = {"r2": r2}

In [None]:
# Terms of the adjusted R^2 formula: n - 1 and n - p - 1, where n is the
# number of test rows and p the number of feature columns
n_obs = len(x_test)
n_features = x.shape[1]
nminusone = n_obs - 1
nminuspminus1 = n_obs - n_features - 1

In [None]:
adjusted_r2= 1 - ((1 - r2) * (nminusone) / (nminuspminus1))

In [None]:
d1["adjusted_r2"]=adjusted_r2

In [None]:
d1

## 7.0 To the Competition