In [70]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import numpy as np

## Data Loading and Cleaning

In [71]:
df = pd.read_csv('/content/train.csv')

In [72]:
df_selected = df[['OverallCond','GrLivArea','GarageArea','TotalBsmtSF','YearBuilt','SalePrice','OverallQual']]

In [73]:
df_selected

Unnamed: 0,OverallCond,GrLivArea,GarageArea,TotalBsmtSF,YearBuilt,SalePrice,OverallQual
0,5,1710,548,856,2003,208500,7
1,8,1262,460,1262,1976,181500,6
2,5,1786,608,920,2001,223500,7
3,5,1717,642,756,1915,140000,7
4,5,2198,836,1145,2000,250000,8
...,...,...,...,...,...,...,...
1455,5,1647,460,953,1999,175000,6
1456,6,2073,500,1542,1978,210000,6
1457,9,2340,252,1152,1941,266500,7
1458,6,1078,240,1078,1950,142125,5


In [74]:
df1 = df_selected

In [75]:
df1

Unnamed: 0,OverallCond,GrLivArea,GarageArea,TotalBsmtSF,YearBuilt,SalePrice,OverallQual
0,5,1710,548,856,2003,208500,7
1,8,1262,460,1262,1976,181500,6
2,5,1786,608,920,2001,223500,7
3,5,1717,642,756,1915,140000,7
4,5,2198,836,1145,2000,250000,8
...,...,...,...,...,...,...,...
1455,5,1647,460,953,1999,175000,6
1456,6,2073,500,1542,1978,210000,6
1457,9,2340,252,1152,1941,266500,7
1458,6,1078,240,1078,1950,142125,5


In [76]:
# Define the mapping function
#mapping the OverallQual in 0 and 1
# 1 = Very Good Condition
# 0 = Not Good Condition
def map_values(value):
    if value in [1, 2, 3, 4, 5]:
        return 0
    elif value in [6, 7, 8, 9,10]:
        return 1
    else:
        return None  # or specify a default value for other cases

# Apply the mapping function to the column
df_selected['OverallQual'] = df_selected['OverallQual'].apply(map_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['OverallQual'] = df_selected['OverallQual'].apply(map_values)


In [77]:
df_selected['OverallQual'].value_counts()

1    922
0    538
Name: OverallQual, dtype: int64

## RandomForestClassfier Using DecisionTreeClassifier

In [78]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
import pandas as pd

In [79]:
#RandomForestClassifier Using the Decision tree classifier
class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.estimators = []

    def fit(self, X, y):
        np.random.seed(self.random_state)
        for _ in range(self.n_estimators):
            # Bootstrap sample
            indices = np.random.choice(len(X), len(X), replace=True)
            X_bootstrap = X[indices]
            y_bootstrap = y[indices]

            # Create a decision tree and fit it on the bootstrap sample
            tree = DecisionTreeClassifier(max_depth=self.max_depth)
            tree.fit(X_bootstrap, y_bootstrap)

            # Add the trained tree to the list of estimators
            self.estimators.append(tree)

    def predict(self, X):
        predictions = np.zeros((X.shape[0], self.n_estimators))
        for i, tree in enumerate(self.estimators):
            predictions[:, i] = tree.predict(X)

        # Voting to determine the final prediction
        return np.apply_along_axis(lambda x: np.argmax(np.bincount(x.astype(int))), axis=1, arr=predictions)


In [80]:
# Split the data into training and test sets
X = df1[['GrLivArea', 'GarageArea', 'TotalBsmtSF','SalePrice','YearBuilt']]
y = df1['OverallQual']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [81]:
#changing the dataframe to the numpy arrays
X_train = X_train.values
y_train = y_train.values
X_test = X_test.values

In [82]:
# Create an instance of RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)

# Fit the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Assuming you have X_test as your test data
predictions = rf_classifier.predict(X_test)


In [83]:
predictions

array([0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0])

In [84]:
 print("Classification Report: \n", classification_report(y_test, predictions))

Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.77      0.80       123
           1       0.84      0.89      0.86       169

    accuracy                           0.84       292
   macro avg       0.84      0.83      0.83       292
weighted avg       0.84      0.84      0.84       292



## RandomForestRegression Using DecisionTreeRegressor

### Normalization

In [85]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler




In [86]:
# Assuming your DataFrame is called 'df' and you have columns for normalization
columns_to_normalize = ['OverallCond','GrLivArea','GarageArea','TotalBsmtSF','YearBuilt','SalePrice','OverallQual']

# Create a new DataFrame with only the columns to be normalized
data_to_normalize = df1[columns_to_normalize]

# Apply normalization
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data_to_normalize)

# Update the original DataFrame with the normalized values
df1[columns_to_normalize] = normalized_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[columns_to_normalize] = normalized_data


In [87]:
df1

Unnamed: 0,OverallCond,GrLivArea,GarageArea,TotalBsmtSF,YearBuilt,SalePrice,OverallQual
0,0.500,0.259231,0.386460,0.140098,0.949275,0.241078,1.0
1,0.875,0.174830,0.324401,0.206547,0.753623,0.203583,1.0
2,0.500,0.273549,0.428773,0.150573,0.934783,0.261908,1.0
3,0.500,0.260550,0.452750,0.123732,0.311594,0.145952,1.0
4,0.500,0.351168,0.589563,0.187398,0.927536,0.298709,1.0
...,...,...,...,...,...,...,...
1455,0.500,0.247362,0.324401,0.155974,0.920290,0.194556,1.0
1456,0.625,0.327619,0.352609,0.252373,0.768116,0.243161,1.0
1457,1.000,0.377920,0.177715,0.188543,0.500000,0.321622,1.0
1458,0.625,0.140166,0.169252,0.176432,0.565217,0.148903,0.0


### Modeling

In [88]:
class RandomForestRegressor:

    def __init__(self, n_estimators=100, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.estimators = []

    def fit(self, X, y):
        np.random.seed(self.random_state)
        for _ in range(self.n_estimators):

            # Bootstrap sample
            indices = np.random.choice(len(X), len(X), replace=True)
            X_bootstrap = X[indices]
            y_bootstrap = y[indices]

            # Create a decision tree and fit it on the bootstrap sample
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X_bootstrap, y_bootstrap)

            # Add the trained tree to the list of estimators
            self.estimators.append(tree)

    def predict(self, X):
        predictions = np.zeros((X.shape[0], self.n_estimators))
        for i, tree in enumerate(self.estimators):
            predictions[:, i] = tree.predict(X)

        # Averaging the predictions to determine the final prediction
        return np.mean(predictions, axis=1)


In [89]:
# Split the data into training and test sets
X = df1[['GrLivArea', 'GarageArea', 'TotalBsmtSF','OverallQual','YearBuilt']]
y = df1['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [90]:
#changing the dataframe to the numpy arrays
X_train = X_train.values
y_train = y_train.values
X_test = X_test.values

In [91]:
# Create an instance of RandomForestRegressor
rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)

# Fit the regressor on the training data
rf_regressor.fit(X_train, y_train)

In [92]:
# performing predictions on the test dataset
predictions = rf_regressor.predict(X_test)

In [93]:
predictions

array([0.1443185 , 0.42440701, 0.10465769, 0.18032773, 0.29493256,
       0.06609429, 0.22450799, 0.15985974, 0.06588004, 0.11210735,
       0.15487005, 0.11126946, 0.13048604, 0.26462977, 0.20230662,
       0.12284926, 0.22045327, 0.13835544, 0.11525691, 0.22645258,
       0.20275309, 0.25550187, 0.19610262, 0.11217539, 0.23606207,
       0.19969393, 0.21543325, 0.09380171, 0.21014095, 0.2237123 ,
       0.11855645, 0.32069628, 0.28284822, 0.11463547, 0.30848603,
       0.15273504, 0.18976509, 0.24022441, 0.35293661, 0.09190251,
       0.13542425, 0.26353784, 0.11296639, 0.4986782 , 0.14350229,
       0.18722497, 0.12561172, 0.10838842, 0.51288568, 0.14503063,
       0.11999514, 0.25544454, 0.13041808, 0.35421408, 0.15229834,
       0.36412564, 0.21932465, 0.16049643, 0.16516873, 0.08104313,
       0.06297736, 0.15164699, 0.41000239, 0.3098367 , 0.37701853,
       0.28188585, 0.09935287, 0.40951832, 0.12400824, 0.19475135,
       0.12678092, 0.13615956, 0.12533073, 0.08476184, 0.59085

In [96]:
# Calculate evaluation metrics
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)


# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


Mean Squared Error: 0.001978798851198477
Mean Absolute Error: 0.029482252281497104
R2 Score: 0.866225468960838
