In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

In [2]:
apts = pd.read_csv("modifieddist_apartment_data.tsv", sep='\t')

In [3]:
def prep_data(data_df):
    
    """
    A function that cleans the data in a given data frame and splits it into a target and predictor set
    
    Args:
        data_df: a DataFrame
        
    Returns:
        X: the values of the DataFrame in which the Sex column has been changed from MALE and FEMALE to 0s and 1s, and
        from which the Species column has been dropped
        y: the values of the DataFrame consisting of only the Species column
    """
    
    #creates a copy of the DataFrame
    df = data_df.copy()
    le = preprocessing.LabelEncoder()
    
    #drops all the NaN values
    df = df[["bed_count","total_sqft","dist","laundry","furnished", "ac", "total_price", "built", "transitscore"]].dropna()
    
    #encodes the column Sex, Species, and Island as integers
    df['laundry'] = le.fit_transform(df['laundry'])
    df['furnished'] = le.fit_transform(df['furnished'])
    df['ac'] = le.fit_transform(df['ac'])
    
    df_copy = df.copy()

    #dividing our data into predictor and target datasets
    X = df.drop(['total_price'], axis = 1)
    y = df['total_price']
    
    return(X, y, df_copy)

In [69]:
# bed_count
# total_sqft
# dist
# laundry
# furnished
# ac
np.random.seed(1001) # 1001 - 90%
train, test = train_test_split(apts, test_size = 0.2) 

In [70]:
X_train, y_train, cleaned_train = prep_data(train)
X_test,  y_test, clean_test  = prep_data(test)

In [71]:
from sklearn.ensemble import RandomForestRegressor

In [88]:
# Create the regressor
regressor = RandomForestRegressor(max_depth = 8, n_estimators = 100, min_samples_split = 3)

# Train the regressor on the training data
regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = regressor.predict(X_test)

# Evaluate the model
score = regressor.score(X_test, y_test)

In [89]:
score 

0.9004302294592662