In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear alg
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import copy
import math



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test splits from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/new-dataset/train.csv")
test = pd.read_csv("/kaggle/input/new-dataset/test.csv")

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Preview the first five rows of the test data.
test.head(n=5)

In [None]:
# Count the missing values in each column of the training data.
# `train` is wrapped in pd.DataFrame to obtain the working frame `train_df`
# that the later cells transform.
train_df = pd.DataFrame(train)
# isnull() flags NaN cells; summing the flags column-wise gives a per-column count.
missing_values = train_df.isnull().sum()


# Bar chart: one bar per column, bar height = number of missing entries.
fig, ax = plt.subplots(figsize=(10, 6))  # 10x6 inch figure
ax.bar(missing_values.index, missing_values.values, color='skyblue')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')
ax.set_title('Missing Values in Dataset')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the column names so they are readable
fig.tight_layout()  # keep the rotated labels inside the figure area
plt.show()

In [None]:
# Count the missing values in each column of the test data.
# `test` is wrapped in pd.DataFrame to obtain the working frame `test_df`
# that the later cells transform.
test_df = pd.DataFrame(test)
# isnull() flags NaN cells; summing the flags column-wise gives a per-column count.
missing_values = test_df.isnull().sum()


# Bar chart: one bar per column, bar height = number of missing entries.
fig, ax = plt.subplots(figsize=(10, 6))  # 10x6 inch figure
ax.bar(missing_values.index, missing_values.values, color='skyblue')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')
ax.set_title('Missing Values in Dataset')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the column names so they are readable
fig.tight_layout()  # keep the rotated labels inside the figure area
plt.show()

In [None]:
# One-hot encode the categorical variables of the train and test frames.
# Categorical columns are the object-dtype columns of the training frame.
categorical_cols = train_df.select_dtypes(include=['object']).columns

# Give every categorical column ONE category list shared by both frames (the
# union of the values seen in train and test). Encoding each frame on its own
# with drop_first=True lets each frame drop *its own* first category, so when a
# level is absent from one frame the same dummy column can end up meaning
# different things in train and test.
shared_dtypes = {
    col: pd.concat([train_df[col], test_df[col]], ignore_index=True).astype('category').dtype
    for col in categorical_cols
}

# astype(dict) returns new frames, so the frames loaded earlier are not modified.
# With a shared categorical dtype, get_dummies emits the same dummy columns and
# drops the same baseline level for both frames.
train_df = pd.get_dummies(train_df.astype(shared_dtypes), columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df.astype(shared_dtypes), columns=categorical_cols, drop_first=True)

# Align test set with training set columns: the outer join keeps every column
# from either frame, and a column missing on one side (e.g. the target in the
# test frame) is created there and filled with 0.
train_df, test_df = train_df.align(test_df, join='outer', axis=1, fill_value=0)



In [None]:
# Fill the remaining NaNs in the (now fully numeric) train and test frames.
# Column means are computed once, before any filling takes place.
train_mean_values = train_df.mean()  # per-column mean of the training data
test_mean_values = test_df.mean()    # kept for inspection; not used for imputation

# Both frames are imputed with the TRAINING means: the model and the scaler are
# fit on training data only, so the test set must go through the same learned
# transformation instead of one derived from its own statistics.
# fillna returns new frames (no inplace mutation), so re-running this cell is safe.
train_df = train_df.fillna(train_mean_values)
test_df = test_df.fillna(train_mean_values)

In [None]:
# Separate the training frame into the target (Y) and the standardized features (X).
from sklearn.preprocessing import StandardScaler

# Target: the sale price column, kept as a pandas Series.
Y = train_df['SalePrice']

# Features: every column except the target. The scaler is fit on the training
# features here and kept in `scaler` so the same transform can be reused later.
scaler = StandardScaler()
X = scaler.fit_transform(train_df.drop(columns=['SalePrice']))



In [None]:
# Draw random starting parameters: one weight per feature column plus a scalar bias.
num_columns = X.shape[-1]  # X is 2-D, so the last axis is the feature axis
w = np.random.rand(num_columns)
b = np.random.rand()

# Cost function
def costf(X, Y, w, b):
    """Return the halved mean squared error of the linear model ``X·w + b``.

    Parameters:
        X: 2-D feature array with one row per sample.
        Y: target values, one per row of X.
        w: weight vector with one entry per column of X.
        b: scalar bias.

    Returns:
        sum((X·w + b - Y)^2) / (2 * number_of_rows).
    """
    n_samples = X.shape[0]
    residuals = (np.dot(X, w) + b) - Y
    return np.sum(residuals ** 2) / (2 * n_samples)




In [None]:
def compute_gradient(X, Y, w, b):
    """Gradient of the halved mean squared error of the linear model ``X·w + b``.

    Parameters:
        X: 2-D array-like of features, one row per sample.
        Y: 1-D array-like of targets (pandas Series, NumPy array or list),
           one value per row of X. Values are used by position.
        w: weight vector with one entry per column of X.
        b: scalar bias.

    Returns:
        (dj_dw, dj_db): dj_dw is an array with one entry per feature and dj_db
        is a scalar; both are averaged over the samples.

    Raises:
        ValueError: if X has no rows or X and Y disagree on the sample count.
    """
    X = np.asarray(X, dtype=float)
    # Converting to a flat array drops any pandas index, so targets are matched
    # to rows by position regardless of the container type of Y.
    targets = np.asarray(Y, dtype=float).reshape(-1)

    m = X.shape[0]
    if m == 0:
        raise ValueError("compute_gradient requires at least one sample")
    if targets.shape[0] != m:
        raise ValueError(
            f"X has {m} rows but Y has {targets.shape[0]} values"
        )

    # Residual of every sample at once (replaces the per-row Python loop).
    err = np.dot(X, w) + b - targets

    # sum_i err_i * X[i] written as a single matrix-vector product.
    dj_dw = np.dot(X.T, err) / m
    dj_db = err.sum() / m

    return dj_dw, dj_db



In [None]:
import math

def gradient_descent(X, Y, w_in, b_in, cost_function, gradient_function, alpha, num_iters):
    """Run batch gradient descent and return the fitted parameters.

    Parameters:
        X, Y: data, passed unchanged to ``cost_function`` and ``gradient_function``.
        w_in: initial weight vector; it is copied, never modified.
        b_in: initial scalar bias.
        cost_function: callable ``(X, Y, w, b) -> cost``.
        gradient_function: callable ``(X, Y, w, b) -> (dj_dw, dj_db)``.
        alpha: learning rate (step size).
        num_iters: number of update steps to perform.

    Returns:
        (w, b, J_history): the final weights, the final bias, and the list of
        costs recorded after every update step.
    """
    J_history = []
    # Float copy: protects the caller's array and keeps the in-place update
    # below valid even when w_in holds integers.
    w = np.array(w_in, dtype=float)
    b = b_in

    # Progress is reported about ten times over the whole run; computed once
    # instead of on every iteration. max(1, ...) keeps the modulus non-zero.
    report_every = max(1, math.ceil(num_iters / 10))

    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(X, Y, w, b)

        # Step against the gradient.
        w -= alpha * dj_dw
        b -= alpha * dj_db

        # Cost after the update, stored for later inspection.
        cost = cost_function(X, Y, w, b)
        J_history.append(cost)

        if i % report_every == 0:
            print(f"Iteration {i:4d}: Cost {cost:8.2f}")

    return w, b, J_history


In [None]:
# Train the linear model on the standardized features X and targets Y
# (both defined in earlier cells).
num_features = X.shape[1]
initial_w = np.random.rand(num_features)  # one random starting weight per feature
initial_b = np.random.rand()              # random starting bias

# Hyperparameters: learning rate and number of gradient-descent steps.
alpha = 0.015
iterations = 100

# Run gradient descent
w_final, b_final, J_hist = gradient_descent(
    X=X,
    Y=Y,
    w_in=initial_w,
    b_in=initial_b,
    cost_function=costf,
    gradient_function=compute_gradient,
    alpha=alpha,
    num_iters=iterations,
)

print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ")

# Spot check: model output next to the true target for the first five rows.
for i in range(5):
    print(f"prediction: {np.dot(X[i], w_final) + b_final:0.2f}, target value: {Y.iloc[i]}")



In [None]:
def predict(test_df, w_final, b_final, scaler):
    """Predict target values for ``test_df`` with the trained linear model.

    Parameters:
        test_df: DataFrame of test features. A 'SalePrice' column, if present,
            is removed before prediction.
        w_final: trained weight vector, one entry per feature column.
        b_final: trained scalar bias.
        scaler: fitted object exposing ``transform``; the scaler that was fit
            on the training features.

    Returns:
        Array of predictions, one per row of ``test_df``.
    """
    # errors='ignore': a test frame that never received a target column is
    # accepted instead of raising KeyError.
    X_test = test_df.drop(columns=['SalePrice'], errors='ignore')

    # Standardize test data using the same scaler as used for training.
    X_test = scaler.transform(X_test)

    # Linear model: X·w + b.
    predictions = np.dot(X_test, w_final) + b_final

    return predictions

# Keep the row identifiers so each prediction can be paired with its Id.
test_ids = test_df['Id']
predictions = predict(test_df, w_final, b_final, scaler)

# Two-column submission table: identifier and predicted price.
submission_columns = {
    'Id': test_ids,
    'SalePrice': predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write the table to a CSV file without the index column
# (adjust the filename as needed).
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")