# Predict Apartment Rent

In [1]:
# If additional packages are needed but are not installed by default, uncomment the last two lines of this cell
# and replace <package list> with a list of additional packages.
# This will ensure the notebook has all the dependencies and works everywhere

#import sys
#!{sys.executable} -m pip install <package list>

In [1]:
# Libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

pd.set_option("display.max_columns", 101)
pd.set_option('display.max_colwidth', 100)

## Data Description

Column | Description
:---|:---
`id` | Unique ID corresponding to the apartment
`age` | Age of the apartment
`commute_distance` | Distance of the nearest commute station from the apartment
`num_stores` | Number of stores near the apartment
`location` | Location of the locality represented as latitude and longitude separated by a delimiter
`outcome` | The predicted rent of the apartment (1 - High, 0 - Low)

In [2]:
# The information dataset for the training set is already loaded below
train = pd.read_csv('/Users/mohankarthikv/Documents/Data Analytics/train.csv')
train.head()

Unnamed: 0,id,age,commute_distance,num_stores,location,outcome
0,0,thirty-four,405.2134,1,"24.97349,121.53372",1.0
1,1,twelve,250.631,7,24.96606_121.54297,1.0
2,2,nine,279.1726,7,24.97528;121.54541,1.0
3,3,seventeen,350.8515,1,24.97544;121.53119,0.0
4,4,5,2408.993,0,24.95505;121.55964,0.0


In [3]:
test = pd.read_csv('/Users/mohankarthikv/Documents/Data Analytics/test.csv')
test.head()

Unnamed: 0,id,age,commute_distance,num_stores,location
0,320,fifteen,289.3248,5,24.98203_121.54348
1,321,nine,-491.0,0,24.98569_121.5276
2,322,0,-70.0,1,24.9748;121.53059
3,323,twenty-nine,535.527,8,24.98092;121.53653
4,324,eighteen,373.3937,8,24.9866;121.54082


In [4]:
train['age'] = pd.to_numeric(train['age'], errors='coerce')
train['commute_distance'] = pd.to_numeric(train['commute_distance'], errors='coerce')

test['age'] = pd.to_numeric(test['age'], errors='coerce')
test['commute_distance'] = pd.to_numeric(test['commute_distance'], errors='coerce')

def split_location(location):
    if pd.isna(location):
        return pd.Series([np.nan, np.nan])
    try:
        sanitized_location = location.replace(';', ',').replace('_', ',')
        parts = sanitized_location.split(',')
        lat, lon = float(parts[0]), float(parts[1])
        return pd.Series([lat, lon])
    except (IndexError, ValueError):
        return pd.Series([np.nan, np.nan])

    # parts = location.replace(';', ',').split(',')
    # return pd.Series([float(parts[0]), float(parts[1])])

if 'location' in train.columns:
    train[['latitude', 'longtitude']] = train['location'].apply(split_location)
    train = train.drop(columns=['location'])

if 'location' in test.columns:
    test[['latitude', 'longtitude']] = test['location'].apply(split_location)
    test = test.drop(columns=['location'])


# train = train.drop(columns=['location'])
# test = test.drop(columns=['location'])

train = train.fillna(method='ffill')
test = test.fillna(method='ffill')

X=train.drop(columns=['id', 'outcome'])
y=train['outcome']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)
X_test_imputed = imputer.transform(test.drop(columns=['id']))

## Machine Learning

Build a machine learning model that can predict the outcome.
- **The model's performance will be evaluated on the basis of Accuracy Score.**

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test.drop(columns=['id']))

In [7]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train_imputed, y_train)

y_val_pred=model.predict(X_val_imputed)
print(f'Validation Accuracy: {accuracy_score(y_val, y_val_pred)}')

Validation Accuracy: 0.796875


> #### Task:
- **Submit the predictions on the test dataset using your optimized model** <br/>
    Submit a CSV file with a header row plus each of the test entries, each on its own line. 

The file (`submissions.csv`) should have exactly 2 columns:

Column | Description
:---|:---
`id`  | Unique ID corresponding to the apartment
`outcome`  | The predicted rent of the apartment (1 - High, 0 - Low)

In [8]:
test = pd.read_csv('/Users/mohankarthikv/Documents/Data Analytics/test.csv')
test.head()

Unnamed: 0,id,age,commute_distance,num_stores,location
0,320,fifteen,289.3248,5,24.98203_121.54348
1,321,nine,-491.0,0,24.98569_121.5276
2,322,0,-70.0,1,24.9748;121.53059
3,323,twenty-nine,535.527,8,24.98092;121.53653
4,324,eighteen,373.3937,8,24.9866;121.54082


In [11]:
test_predictions = model.predict(X_test_imputed)
submission_df = pd.DataFrame({
    'id': test['id'],
    'outcome': test_predictions
})

In [12]:
#Submission
submission_df.to_csv('submissions.csv', index=False)