## Importing Libraries 

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

## Loading and Understanding your Dataset 

In [None]:
# Load the raw CSV into a DataFrame.
# NOTE(review): this is an absolute path on one specific machine; point it at
# your own copy of the file before running.
Dataset= pd.read_csv('C:/Users/kunal/Desktop/Ml Projects/Data sets/Real Estate dataset.csv')
Dataset.head()  # preview the first rows

Use the commands below to check whether your data has been loaded properly.

In [None]:
# A notebook cell only renders the value of its LAST expression, so any result
# produced earlier in the cell has to be shown explicitly with display()
# (display is provided by the IPython/Jupyter kernel).
Dataset.info()  # prints null counts and the dtype of every attribute

display(Dataset.describe())  # summary statistics, to get a feel for the data

# If the dataset contains a categorical variable (or any column of interest),
# check how many different values it has and how often each one occurs.
# NOTE(review): the column name starts with a space - confirm it matches the CSV header.
Dataset[' CHAS'].value_counts()

To get a better understanding of your dataset visualize it.

In [None]:
%matplotlib inline
# Histograms are a good way to understand the distribution of your data;
# choose the number of bins that suits it.
Dataset.hist(bins=50, figsize=(20,15))

## Formation of Trainset and Testset  

Now you have a good overview over your data.

You understand what is the nature of your attributes and their importance.

In [None]:
from sklearn.model_selection import train_test_split 
# test_size is the fraction of the data held out as the test set; 20 % is a
# common choice, but you can reserve whatever portion you want.
# random_state fixes the random selection so that every run produces the same
# split. Without it, repeated runs would each draw a different test set, so
# over time the whole dataset would be seen during training and the model
# may overfit.
train_set, test_set = train_test_split(Dataset, test_size=0.2, random_state=42)

print(f"Rows in train set: {len(train_set)}\n Rows in test set: {len(test_set)}\n") 

In [None]:
# If your data contains an important categorical variable, use stratified
# sampling. A single split is requested, with 20 % of the rows held out and a
# fixed seed.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits= 1, test_size= 0.2, random_state= 42)

To split based on a specific column:

In [None]:
# split.split() yields arrays of POSITIONAL row numbers, so the rows must be
# selected with .iloc. Label-based .loc only gives the same result while the
# DataFrame still carries its default 0..n-1 index.
for train_index, test_index in split.split(Dataset, Dataset['Column']):  # choose your categorical column
    strat_train_set = Dataset.iloc[train_index]
    strat_test_set = Dataset.iloc[test_index]

Check if the splitting is proper or not based on your data and your aim.

In [None]:
strat_test_set.info()
# Check how the categorical variable is distributed in the training split.
strat_train_set['Column'].value_counts()

## Correlation Study

In [None]:
# Correlation between all pairs of attributes.
corr_matrix = Dataset.corr()
# Correlation of every attribute with the label, sorted from highest to lowest.
# The placeholder is spelled "Label_column", the same spelling used where the
# label is separated from the features.
corr_matrix['Label_column'].sort_values(ascending= False)

For better understanding try visualizing highly correlated fields.

In [None]:
from pandas.plotting import scatter_matrix
# Select the attributes that stood out in the correlation study.
attributes = ["Column1", "Column2", "Column3", "Column4"]
# Pairwise scatter plots of the selected attributes.
scatter_matrix(Dataset[attributes], figsize= (12,8))


Replace your main dataset with the training set and separate the label (the feature to be predicted) from the other attributes.

In [None]:
# NOTE: from here on `Dataset` holds only the training features. The full
# frame loaded from the CSV is no longer reachable under this name, so
# re-running earlier cells after this one will not see the original data.
Dataset = strat_train_set.drop("Label_column", axis= 1)
Dataset_label = strat_train_set["Label_column"].copy()  # the label to predict


Check that the two separated sets (features and label) have the same number of rows.

In [None]:
Dataset.info()        # the training features
Dataset_label.info()  # the training label

In [None]:
# Check the number of samples in both Dataset and Dataset_label
print(f'Dataset shape: {Dataset.shape}')
print(f'Dataset_label shape: {Dataset_label.shape}')

# Count the missing values in the label column of the stratified training set.
# The message names the column and the set that the expression actually reads.
print(f'Missing values in Label_column (training set): {strat_train_set["Label_column"].isnull().sum()}')

# Ensure consistent number of rows
if Dataset.shape[0] != Dataset_label.shape[0]:
    print("Mismatch in number of samples between features and target.")
else:
    print("Number of samples match between features and target.")


## Create Pipeline 

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler 
# Preprocessing pipeline: the steps run in the order listed.
creating_pipeline = Pipeline([
    # The imputer deals with missing values. The strategy used here is the
    # median; you can choose another one (e.g. the mode or a constant 0).
    ('imputer', SimpleImputer(strategy= "median")),
    # ...add more steps here if you want
    ('std_scaler', StandardScaler()),
])
Dataset_num_tr = creating_pipeline.fit_transform(Dataset) # fit the pipeline on your data and transform it
Dataset_num_tr.shape                                      # shape of the transformed data

Import the models that you want to test 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor #you can use multiple models
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
# Pick ONE model to try; switch by moving the comment marker.
# One choice is left active so that the cross-validation cell further down
# finds `model` when the notebook is run top to bottom on a fresh kernel.
model = LinearRegression()
# model = DecisionTreeRegressor()
# model = RandomForestRegressor()
model.fit(Dataset_num_tr, Dataset_label)  # features and label were separated beforehand
# If features and label live together in ONE 2-D array `data`, slice it by
# column instead. The first index selects rows, so a bare [:13] would pick
# rows, not columns:
# model.fit(data[:, :13], data[:, 13])

Predict, then compare the MSE (mean squared error) and RMSE values of the different models to see which model shows the least error.

In [None]:
# Uncomment to score the chosen model on the same data it is given for
# prediction here (the transformed training features), i.e. the training error.
# from sklearn.metrics import mean_squared_error
# Dataset_predictions = model.predict(Dataset_num_tr)
# MSE = mean_squared_error(Dataset_label, Dataset_predictions)
# RMSE = np.sqrt(MSE)

## Cross Validation

Cross Validate your selected model and do iterations.

In [None]:
from sklearn.model_selection import cross_val_score
# The "neg_mean_squared_error" scorer returns the MSE with its sign flipped,
# so negate the scores before taking the square root to get the RMSE per fold.
scores = cross_val_score(model, Dataset_num_tr, Dataset_label, scoring="neg_mean_squared_error")
rmse_scores = np.sqrt(-scores)
# A bare `rmse_scores` would not be rendered here: the cell continues with a
# function definition, and only a cell's final expression is displayed.
print("RMSE per fold:", rmse_scores)
def print_scores(scores):
    """Print the scores, then their mean, then their standard deviation.

    `scores` must provide .mean() and .std() (e.g. a NumPy array). One line
    is printed per item; nothing is returned.
    """
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    # Each statistic is computed only when its line is about to be printed.
    for label, compute in report:
        print(label, compute(scores))

Automated process for above iterations

In [None]:
# Candidate models to compare. The tree-based estimators involve randomness,
# so they get a fixed seed (the same value used for the data splits) to make
# the cross-validation numbers repeatable from run to run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42)
}
def print_scores(scores):
    """Print `scores` followed by its mean and its standard deviation.

    `scores` needs .mean() and .std() methods (e.g. a NumPy array).
    Returns nothing; the three lines go to standard output.
    """
    print("Scores:", scores)
    for label, statistic in (("Mean:", "mean"), ("Standard deviation:", "std")):
        # Look the method up by name and call it just before printing.
        print(label, getattr(scores, statistic)())

# Cross-validate every candidate model and record the RMSE summary both in a
# text file and on the console. The encoding is given explicitly so the file
# is written the same way on every platform.
with open('model_results.txt', 'w', encoding='utf-8') as file:
    # The loop variable is deliberately not called `model`, so the loop does
    # not overwrite a notebook-level `model` chosen in another cell.
    for model_name, estimator in models.items():
        # 10-fold cross-validation; the scorer returns negated MSE values
        scores = cross_val_score(estimator, Dataset_num_tr, Dataset_label, scoring="neg_mean_squared_error", cv=10)
        rmse_scores = np.sqrt(-scores)

        # Write the summary to the file
        file.write(f"Results for {model_name}:\n")
        file.write(f"Scores: {rmse_scores}\n")
        file.write(f"Mean: {rmse_scores.mean()}\n")
        file.write(f"Standard deviation: {rmse_scores.std()}\n")
        file.write("\n")

        # Also print the same summary to the console
        print(f"Results for {model_name}:")
        print_scores(rmse_scores)
        print("\n")

Choose the best suited model and fit over your training set

In [46]:
# Fixed seed (the same value used for the data splits) so the fitted forest,
# and every prediction made from it, is the same on each run.
model = RandomForestRegressor(random_state=42)
model.fit(Dataset_num_tr, Dataset_label)

 1 model.fit(X, Y) # fit your model on the training data after splitting off the column you want to predict
 2 model.fit(housing_num_tr[:, :n], housing_num_tr[:, n]) # or slice a single array by column: the first n columns are the features, column n is the label
 
 model.fit requires two arguments, so either pre-split your columns as in option 1 or slice them as in option 2.

Try predicting some values to check whether the model is working properly.

In [47]:
some_data = Dataset.iloc[:5]  # the first five rows of the training features

In [49]:
some_lables = Dataset_label.iloc[:5]  # the labels of the same five rows
# transform() only: the pipeline was already fitted on the training features.
prepared_data = creating_pipeline.transform(some_data)
model.predict(prepared_data)

some_lables

array([22.496, 25.88 , 16.822, 23.291, 23.487])

## Launching Your Model 


In [42]:
from joblib import dump, load
# Save the fitted model to disk; give the file whatever name you like
# (e.g. Predictor). `load` is imported but not used in this cell.
dump(model,'Predictor.joblib')

['Predictor.joblib']

## Model Testing 

In [43]:
# The label placeholder is spelled "Label_column", the same spelling used when
# the label was separated from the training features.
X_test = strat_test_set.drop("Label_column", axis=1)
Y_test = strat_test_set["Label_column"].copy()
# transform() only: the test features go through the already-fitted pipeline.
X_test_prepared = creating_pipeline.transform(X_test)

In [52]:
# Predict on the prepared test features and measure the error against the
# true test labels.
final_prediction = model.predict(X_test_prepared)
Final_mse = mean_squared_error(Y_test, final_prediction)
final_rmse = np.sqrt(Final_mse)
print(final_prediction, list(Y_test))
# Report the metric itself; it was computed above but never shown.
print(f"Final RMSE on the test set: {final_rmse}")


[25.359 11.841 25.469 21.635 17.788 14.985 20.276 14.295 31.944 41.341
 19.822 11.757 25.031 27.815 19.515 11.18  31.082 14.447 23.673 18.837
 19.991 17.403 19.063 22.29  18.095 31.644 16.459 32.388  9.139 33.858
 23.936 21.083 22.904 11.264 20.982 11.448 43.678 24.569 23.836 42.213
 24.261 29.5   20.093 20.824 18.604 32.86  44.855 20.143 20.36  21.771
 20.755 14.609 21.12  14.89  25.425 33.227 42.131 29.661 19.465 20.736
 46.738 10.207 18.984 24.725 14.705 33.009 19.043 17.516 18.802 34.54
 25.764 22.668 21.116 22.537 34.232 12.734 15.826 19.95  21.129 21.3
 22.63  20.858 14.003 22.674 20.801 21.441 14.047 20.889 22.021 23.365
 18.28  27.504  7.341 26.189 19.013 29.846 19.585 31.077 14.386 26.587
 21.628 20.554] [16.5, 10.2, 30.1, 23.0, 14.4, 15.6, 19.4, 14.1, 30.3, 35.2, 23.1, 13.8, 25.0, 27.9, 19.5, 12.3, 32.2, 13.5, 23.8, 21.7, 19.2, 19.5, 10.4, 23.2, 18.6, 28.5, 15.2, 32.0, 7.2, 34.6, 20.1, 20.6, 23.6, 13.1, 23.8, 12.7, 43.1, 24.7, 22.2, 44.0, 28.1, 31.0, 21.7, 23.4, 19.5, 33.1, 4

# END

# understanding of different functions 

In [None]:
# Hand-written train/test split, kept commented out for reference only.
# def split_train_test(data, test_ratio):
#     np.random.seed(42)
#     shuffeled = np.random.permutation(len(data))
#     test_size = int(len(data) * test_ratio)
#     test_indicies = shuffeled[:test_size]
#     train_indicies = shuffeled[test_size:]
#     return data.iloc[test_indicies], data.iloc[train_indicies]
# test_set, train_set = split_train_test(Dataset, 0.2)
# print(f"Rows in train set: {len(train_set)}\n Rows in test set: {len(test_set)}\n")

To take care of missing attributes, you have three options:
1. Get rid of the data points (rows) that have the value missing.
2. Get rid of the whole attribute.
3. Set the missing values to some value (0, the mean or the median).

In [None]:
# Dataset.dropna(subset=["column"])   # option 1
# Dataset.drop("column", axis=1)      # option 2
# median = Dataset["column"].median()
# Dataset["column"].fillna(median)    # option 3
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy="median")
# imputer.fit(Dataset)
# imputer.statistics_.shape
