In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load and Explore the Dataset
data = pd.read_csv('C:\\Users\\Hrishikesh\\Downloads\\data.csv')
print(data)
# Step 2: Data Preprocessing
# Drop irrelevant columns like street address
data.drop(['street'], axis=1, inplace=True)

# Convert date column to numerical representation (e.g., number of days since a certain date)
data['date'] = pd.to_datetime(data['date'])
data['days_since_date'] = (data['date'] - data['date'].min()).dt.days
data.drop(['date'], axis=1, inplace=True)

# Convert categorical variables like city into numerical representation
data = pd.get_dummies(data, columns=['city'])

# Separate features and target variable
X = data.drop('price', axis=1)
y = data['price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify non-numeric columns
non_numeric_cols = X_train.select_dtypes(exclude=['float64', 'int64']).columns

# Remove non-numeric columns from training and testing sets
X_train_numeric = X_train.drop(columns=non_numeric_cols)
X_test_numeric = X_test.drop(columns=non_numeric_cols)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

# Step 3: Choose a Regression Model
model = LinearRegression()

# Step 4: Train the Model and Evaluate Performance
model.fit(X_train_scaled, y_train)
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)
train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_test, y_pred_test, squared=False)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
print(f'Training RMSE: {train_rmse}')
print(f'Testing RMSE: {test_rmse}')
print(f'Training R^2 Score: {train_r2}')
print(f'Testing R^2 Score: {test_r2}')

# Step 5: Make Predictions
input_data = {
    'date':['02-05-2014'],
    'sqft_lot':[8000],
    'floors':[1],
    'waterfront':[0],
    'view':[0],
    'condition':[0],
    'sqft_above':[1200],
    'sqft_base':[200],
    'yr_built':[1950],
    'yr_renovated':[2005],
    'street':['18810 Densmore Ave N'],
    'city':['Boston'],
    'statezip':['WA 98133'],
    'country':['USA'],
    'bedrooms': [3],
    'bathrooms': [2],
    'sqft_living': [2000],
    'city_Boston': [1],  
}
input_df = pd.DataFrame(input_data)

# Preprocess input data
# Drop street address column
input_df.drop(['street'], axis=1, inplace=True)
# Convert date column to numerical representation
input_df['date'] = pd.to_datetime(input_df['date'])
input_df['days_since_date'] = (input_df['date'] - input_df['date'].min()).dt.days
input_df.drop(['date'], axis=1, inplace=True)
# Convert categorical variables like city into numerical representation
input_df = pd.get_dummies(input_df, columns=['city'])

# Scale input data
input_numeric = input_df.drop(columns=non_numeric_cols)
input_scaled = scaler.transform(input_numeric)

# Make predictions
predicted_price = model.predict(input_scaled)[0]
print(f'Predicted price: {predicted_price}')


                     date         price  bedrooms  bathrooms  sqft_living  \
0     2014-05-02 00:00:00  3.130000e+05       3.0       1.50         1340   
1     2014-05-02 00:00:00  2.384000e+06       5.0       2.50         3650   
2     2014-05-02 00:00:00  3.420000e+05       3.0       2.00         1930   
3     2014-05-02 00:00:00  4.200000e+05       3.0       2.25         2000   
4     2014-05-02 00:00:00  5.500000e+05       4.0       2.50         1940   
...                   ...           ...       ...        ...          ...   
4595  2014-07-09 00:00:00  3.081667e+05       3.0       1.75         1510   
4596  2014-07-09 00:00:00  5.343333e+05       3.0       2.50         1460   
4597  2014-07-09 00:00:00  4.169042e+05       3.0       2.50         3010   
4598  2014-07-10 00:00:00  2.034000e+05       4.0       2.00         2090   
4599  2014-07-10 00:00:00  2.206000e+05       3.0       2.50         1490   

      sqft_lot  floors  waterfront  view  condition  sqft_above  \
0       

KeyError: "['statezip' 'country' 'city_Algona' 'city_Auburn'\n 'city_Beaux Arts Village' 'city_Bellevue' 'city_Black Diamond'\n 'city_Bothell' 'city_Burien' 'city_Carnation' 'city_Clyde Hill'\n 'city_Covington' 'city_Des Moines' 'city_Duvall' 'city_Enumclaw'\n 'city_Fall City' 'city_Federal Way' 'city_Inglewood-Finn Hill'\n 'city_Issaquah' 'city_Kenmore' 'city_Kent' 'city_Kirkland'\n 'city_Lake Forest Park' 'city_Maple Valley' 'city_Medina'\n 'city_Mercer Island' 'city_Milton' 'city_Newcastle' 'city_Normandy Park'\n 'city_North Bend' 'city_Pacific' 'city_Preston' 'city_Ravensdale'\n 'city_Redmond' 'city_Renton' 'city_Sammamish' 'city_SeaTac'\n 'city_Seattle' 'city_Shoreline' 'city_Skykomish' 'city_Snoqualmie'\n 'city_Snoqualmie Pass' 'city_Tukwila' 'city_Vashon' 'city_Woodinville'\n 'city_Yarrow Point'] not found in axis"