In [66]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [67]:
# Inline dataset, one list per column (13 entries each).
# np.nan marks the entries that are missing in SquareFeet and Bedrooms.
data = {
    'SquareFeet': [
        650, 850, np.nan, 1100, 950, 1170, 980,
        np.nan, 700, 850, 1000, 960, 850,
    ],
    'Bedrooms': [
        2, 3, 2, 3, np.nan, 4, 3,
        4, 2, 3, 3, 2, 3,
    ],
    'Age': [
        5, 7, 3, 10, 2, 12, 8,
        4, 5, 6, 9, 11, 7,
    ],
    'Price': [
        300000, 350000, 320000, 500000, 330000, 600000, 370000,
        620000, 310000, 340000, 400000, 360000, 350000,
    ],
}

In [68]:
df = pd.DataFrame(data)
# Fill missing values with the median of each column.
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df[column]: that form is chained assignment on a
# column selection, which pandas warns about and which stops updating df
# once Copy-on-Write is active, leaving the NaNs in place.
for column in ['SquareFeet', 'Bedrooms', 'Age']:
  df[column] = df[column].fillna(df[column].median())

In [69]:
# Keep the first occurrence of each fully identical row; the surviving rows keep their original index labels.
df = df.drop_duplicates()

In [70]:
# Standardise the three predictor columns; Price is left on its original scale.
# NOTE(review): the scaler is fit on every row, before the train/test split
# performed in a later cell, so held-out rows contribute to the fitted statistics.
numeric_features = ['SquareFeet', 'Bedrooms', 'Age']
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

In [71]:
def detect_outliers_iqr(df, feature, multiplier=1.5):
  """Return the rows of ``df`` whose ``feature`` value lies outside the IQR fences.

  The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
  where Q1 and Q3 are the 25th and 75th percentiles of ``df[feature]`` and
  ``IQR = Q3 - Q1``. Values exactly on a fence are not reported.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the column to inspect. It is not modified.
  feature : str
      Name of the column to test.
  multiplier : float, default 1.5
      Width of the fences in units of IQR. The default reproduces the
      previously hard-coded 1.5 rule; larger values flag fewer rows.

  Returns
  -------
  pandas.DataFrame
      The subset of ``df`` (all columns, original index labels) whose
      ``feature`` value is below the lower fence or above the upper fence.
      Empty when nothing is flagged.

  Raises
  ------
  ValueError
      If ``multiplier`` is negative, which would invert the fences.
  """
  if multiplier < 0:
    raise ValueError("multiplier must be non-negative")
  values = df[feature]
  Q1 = values.quantile(0.25)
  Q3 = values.quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - multiplier * IQR
  upper_bound = Q3 + multiplier * IQR
  # Strict comparisons: a value sitting exactly on a fence is kept as an inlier.
  is_outlier = (values < lower_bound) | (values > upper_bound)
  return df[is_outlier]

In [74]:
# Apply the IQR rule to the SquareFeet column and show whichever rows it flags.
outliers_squarefeet = detect_outliers_iqr(df=df, feature='SquareFeet')
print("Outliers detected using IQR in 'SquareFeet':\n", outliers_squarefeet)

Outliers detected using IQR in 'SquareFeet':
 Empty DataFrame
Columns: [SquareFeet, Bedrooms, Age, Price]
Index: []


In [75]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [76]:
from sklearn.linear_model import LinearRegression
# Predictors and target for the regression
X = df[['SquareFeet', 'Bedrooms', 'Age']]
y = df['Price']
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit a linear regression on the training portion (fit returns the estimator itself)
linear_reg = LinearRegression().fit(X_train, y_train)
# Predict the held-out rows and report the root of the mean squared error
y_pred = linear_reg.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred)
print("Linear Regression RMSE:", np.sqrt(linear_mse))

Linear Regression RMSE: 53952.69437438349


In [80]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the predictors into degree-2 polynomial terms (the degree is a tunable choice)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# Split the expanded features with the same test fraction and seed as the linear model;
# this rebinds y_train and y_test
X_train_poly, X_test_poly, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)
# Fit a linear regression on the polynomial terms (fit returns the estimator itself)
poly_reg = LinearRegression().fit(X_train_poly, y_train)
# Predict the held-out rows and report the root of the mean squared error
y_pred_poly = poly_reg.predict(X_test_poly)
poly_mse = mean_squared_error(y_test, y_pred_poly)
print("Polynomial Regression RMSE:", np.sqrt(poly_mse))

Polynomial Regression RMSE: 37736.92624527236
