In [166]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.model_selection import train_test_split
pd.options.display.max_columns = None

import statsmodels
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler 

from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [168]:
# Read the training and test CSV files (paths are relative to the notebook).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_(2)_(1)_(1).csv', 'test_(2)_(1).csv')
)

In [170]:
# Preview the first rows of the test set to see the raw column values.
df_test.head()

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,,650,1.0,1.0
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,SrncyRe,1370,2.0,1.0
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,AjhalNa,1725,3.0,2.0
3,3,Built-up Area,Ready To Move,Jalahalli,2 BHK,,1000,2.0,0.0
4,4,Plot Area,Ready To Move,TC Palaya,1 Bedroom,,1350,1.0,0.0


In [172]:
# Inspect column dtypes; note that total_sqft is loaded as object, not numeric.
df_test.dtypes

ID                int64
area_type        object
availability     object
location         object
size             object
society          object
total_sqft       object
bath            float64
balcony         float64
dtype: object

In [174]:
# Summary statistics for the numeric columns of the test set.
df_test.describe()

Unnamed: 0,ID,bath,balcony
count,2664.0,2656.0,2559.0
mean,1331.5,2.700678,1.594764
std,769.174883,1.297112,0.800891
min,0.0,1.0,0.0
25%,665.75,2.0,1.0
50%,1331.5,2.0,2.0
75%,1997.25,3.0,2.0
max,2663.0,18.0,3.0


In [176]:
# Per-column count of missing entries in the test set.
missing_values = df_test.isna().sum()
missing_values

ID                 0
area_type          0
availability       0
location           0
size               2
society         1074
total_sqft         0
bath               8
balcony          105
dtype: int64

In [178]:
# Build a table of missing-value counts and percentages per column,
# ordered from the most to the least incomplete column.
null_counts = df_test.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / df_test.shape[0] * 100).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data

Unnamed: 0,Total,Percent
society,1074,40.315315
balcony,105,3.941441
bath,8,0.3003
size,2,0.075075
ID,0,0.0
area_type,0,0.0
availability,0,0.0
location,0,0.0
total_sqft,0,0.0


In [180]:
# total_sqft is loaded as object (string) dtype, so it must be converted to a numeric value

def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``'1200'``), a range written as
        ``'low - high'`` (``'1000 - 1200'``), or a value that is already
        numeric (e.g. when this conversion is applied a second time).

    Returns
    -------
    float or None
        The numeric value; for a range, the midpoint of its two bounds.
        ``None`` when the entry is missing (``None``/NaN) or cannot be parsed
        as a number (for example a value carrying a unit suffix).
    """
    # Already-numeric input must pass through unchanged. Testing `'-' in x`
    # on a float raises TypeError, which previously got swallowed and turned
    # every value into None when the conversion cell was re-run.
    if not isinstance(x, str):
        try:
            value = float(x)
        except (TypeError, ValueError):
            return None
        # NaN is the only float not equal to itself; report it as missing.
        return None if value != value else value

    try:
        if '-' in x:  # range values: use the midpoint of the two bounds
            nums = x.split('-')
            return (float(nums[0]) + float(nums[1])) / 2
        return float(x)
    except ValueError:
        # Not parseable as a number (or as a numeric range).
        return None

In [182]:
# Parse total_sqft into numbers in both the training and the test frame.
for frame in (df_train, df_test):
    frame['total_sqft'] = frame['total_sqft'].apply(convert_sqft_to_num)

In [184]:
# Impute missing bath / balcony values.
#
# Assign the result back instead of calling `df[col].fillna(..., inplace=True)`:
# that form is a chained in-place operation on an intermediate Series, which
# pandas deprecates and which does not modify the DataFrame under Copy-on-Write.
#
# The median is learned from the training data only and reused for the test
# set, so both sets go through the same transformation.
bath_median = df_train['bath'].median()
df_train['bath'] = df_train['bath'].fillna(bath_median)
df_train['balcony'] = df_train['balcony'].fillna(0)
df_test['bath'] = df_test['bath'].fillna(bath_median)
df_test['balcony'] = df_test['balcony'].fillna(0)

In [186]:
# Impute total_sqft entries that are still missing after numeric conversion.
#
# Assign the result back rather than using `df[col].fillna(..., inplace=True)`,
# which is a chained in-place call that pandas deprecates and that leaves the
# DataFrame unchanged under Copy-on-Write.
#
# The fill value is the training median, applied to both sets so the test data
# is transformed with statistics learned from the training data.
sqft_median = df_train['total_sqft'].median()
df_train['total_sqft'] = df_train['total_sqft'].fillna(sqft_median)
df_test['total_sqft'] = df_test['total_sqft'].fillna(sqft_median)

In [188]:
# Categorical to numerical: one-hot encode the categorical columns.
categorical_cols = ['area_type', 'location', 'size']

# The training encoding defines the feature space; the first level of each
# column is dropped there and acts as the baseline category.
df_train_encoded = pd.get_dummies(df_train, columns=categorical_cols, drop_first=True)

# Do NOT drop the first level on the test set. `drop_first` would remove the
# test set's own first category, which may differ from the training baseline;
# that column would then be missing and get filled with 0 when the test frame
# is aligned to the training columns, so those rows would be encoded as the
# training baseline. Keeping all test levels lets the alignment step keep
# exactly the columns the model was trained on and discard the rest.
df_test_encoded = pd.get_dummies(df_test, columns=categorical_cols, drop_first=False)

In [190]:
# Align the test frame to the training column layout; columns that exist only
# in the training frame are created and filled with 0.
df_test_encoded = df_test_encoded.reindex(
    df_train_encoded.columns,
    axis=1,
    fill_value=0,
)

In [192]:
# Target vector and feature matrix for training: the target, the row ID and
# the un-encoded text columns are excluded from the features.
non_feature_cols = ['price', 'ID', 'society', 'availability']
y_train = df_train_encoded['price']
X_train = df_train_encoded.drop(columns=non_feature_cols)

In [194]:
# Feature matrix for the test set: drop the same non-feature columns that are
# removed from the training features.
X_test = df_test_encoded.drop(columns=['price', 'ID', 'society', 'availability'])

In [196]:
# Fit an ordinary least-squares linear regression on the training features.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [197]:
# Predict prices for the test feature matrix.
test_predictions = model.predict(X=X_test)

In [198]:
# Store the predictions in the test frame's price column.
df_test_encoded = df_test_encoded.assign(price=test_predictions)

In [200]:
# Show the ID / predicted price pairs.
print(df_test_encoded.loc[:, ['ID', 'price']])

        ID       price
0        0  165.100777
1        1   76.518275
2        2  110.320102
3        3   44.614849
4        4   48.848177
...    ...         ...
2659  2659  250.651294
2660  2660  361.688402
2661  2661   36.007866
2662  2662  301.320989
2663  2663   15.460788

[2664 rows x 2 columns]


In [204]:
# Write the ID / predicted price pairs to disk without the DataFrame index.
submission = df_test_encoded.loc[:, ['ID', 'price']]
submission.to_csv('test_prediction.csv', index=False)