# House Price Prediction by Kapil

A machine learning project to predict house prices using Python, Pandas, and scikit-learn.

**Internship:** ONLEI Technologies

---

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

## 2. Load the Dataset

In [None]:
# Read the training CSV (path is relative to the notebook's working
# directory) and preview the first rows.
train_csv_path = '../data/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Report the frame's (rows, columns) dimensions, then print the per-column
# dtype / non-null summary.
n_rows_cols = df.shape
print('Shape:', n_rows_cols)
df.info()

In [None]:
# Count missing values per column and show the 20 columns with the most gaps.
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False).head(20)

In [None]:
# Descriptive statistics for the frame, displayed as the cell's output.
summary_stats = df.describe()
summary_stats

In [None]:
# Histogram of the target column with a KDE curve overlaid.
fig, ax = plt.subplots()
sns.histplot(df['SalePrice'], kde=True, ax=ax)
ax.set_title('SalePrice Distribution')
plt.show()

In [None]:
# Correlation heatmap
# Correlate numeric columns only. At this point the frame has not been
# one-hot encoded yet (that happens in the preprocessing section), so any
# object-dtype columns are still present; pandas >= 2.0 raises on
# non-numeric data in DataFrame.corr() instead of silently skipping it.
numeric_df = df.select_dtypes(include=np.number)
corr = numeric_df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', annot=False)
plt.title('Correlation Heatmap')
plt.show()

## 4. Data Preprocessing

In [None]:
# Drop columns with too many missing values or not useful.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
cols_to_drop = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id']
df = df.drop(columns=cols_to_drop, errors='ignore')

# NOTE(review): the medians/modes below are computed on the full frame,
# before the train/test split in the model-building cell, so held-out rows
# influence the imputation values. Consider fitting the imputation on the
# training split only if an unbiased test score matters.

# Fill missing numerical values with median
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].median())

# Fill missing categorical values with mode
for col in df.select_dtypes(include='object').columns:
    col_modes = df[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing to impute with in that case, so leave the column untouched
    # rather than failing on the [0] lookup.
    if not col_modes.empty:
        df[col] = df[col].fillna(col_modes.iloc[0])

In [None]:
# Encode categorical features
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded feature.
df = pd.get_dummies(data=df, drop_first=True)

## 5. Model Building

In [None]:
# Separate the target column from the predictors.
target_col = 'SalePrice'
y = df[target_col]
X = df.drop(columns=target_col)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training split, then predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

## 6. Model Evaluation

In [None]:
# Score the held-out predictions: R^2, plus RMSE derived from the MSE.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Both metrics are reported to two decimals, one per line.
print(f'RMSE: {rmse:.2f}', f'R^2 Score: {r2:.2f}', sep='\n')

In [None]:
# Scatter the held-out actual prices against the model's predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('Actual SalePrice')
ax.set_ylabel('Predicted SalePrice')
ax.set_title('Actual vs Predicted SalePrice')
plt.show()

## 7. Next Steps
- Try other regression models (Random Forest, XGBoost, etc.)
- Hyperparameter tuning
- Build a simple Streamlit app
- Document everything in your GitHub README

Good luck and happy learning! 🚀