In [1]:
# !pip install word2number

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import math

from word2number import w2n
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('homeprices.csv')
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [4]:
median_bedrooms = math.floor(df.bedrooms.median())
median_bedrooms

4

In [5]:
df.bedrooms = df.bedrooms.fillna(median_bedrooms)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [6]:
reg = linear_model.LinearRegression()
reg.fit(df[['area', 'bedrooms', 'age']], df.price)

LinearRegression()

In [7]:
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [8]:
reg.intercept_

221323.0018654043

In [9]:
reg.predict([[3000, 3, 40]])



array([498408.25158031])

In [10]:
(112.06244194*3000) + (23388.88007794*3) + (-3231.71790863*40) + 221323.00186540408

498408.2515740241

In [11]:
reg.predict([[2500, 4, 5]])



array([578876.03748933])

# Exercise

In [12]:
hiring_df = pd.read_csv('hiring.csv')
hiring_df.head()

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


## Preprocessing

In [13]:
hiring_df.experience = hiring_df.experience.fillna('zero')
hiring_df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [14]:
exp = hiring_df.experience
print(exp)
for idx, word in enumerate(hiring_df.experience):
    num = w2n.word_to_num(word)
    exp[idx] = num
exp

0      zero
1      zero
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp[idx] = num


0     0
1     0
2     5
3     2
4     7
5     3
6    10
7    11
Name: experience, dtype: object

In [15]:
exp.astype(float)
hiring_df.experience = exp
hiring_df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


In [16]:
test_score_median = math.floor(hiring_df['test_score(out of 10)'].median())
test_score_median

8

In [17]:
hiring_df['test_score(out of 10)'] = hiring_df['test_score(out of 10)'].fillna(test_score_median)
hiring_df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [18]:
salary_reg = linear_model.LinearRegression()
salary_reg.fit(hiring_df[['experience', 'test_score(out of 10)', 'interview_score(out of 10)']],
              hiring_df['salary($)'])

LinearRegression()

In [19]:
salary_reg.predict([[2, 9, 1]])



array([42179.76710335])

In [20]:
salary_reg.predict([[12, 10, 10]])



array([92002.18340611])