In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
print(train_data.shape)
print(test_data.shape)

(47439, 41)
(31626, 40)


In [4]:
#删除部分列
train_data.drop(['Id','Sold Price', 'Address','Elementary School','Middle School','High School','Zip','State','Annual tax amount','Parking features','Laundry features','Cooling features','Heating features'], axis=1, inplace=True)
test_data.drop(['Id', 'Address','Elementary School','Middle School','High School','Zip','State','Annual tax amount','Parking features','Laundry features','Cooling features','Heating features'], axis=1, inplace=True)
all_features = pd.concat((train_data, test_data))
all_features.head()

Unnamed: 0,Summary,Type,Year built,Heating,Cooling,Parking,Lot,Bedrooms,Bathrooms,Full bathrooms,...,High School Score,High School Distance,Flooring,Appliances included,Tax assessed value,Listed On,Listed Price,Last Sold On,Last Sold Price,City
0,"540 Pine Ln, Los Altos, CA 94022 is a single f...",SingleFamily,1969.0,"Heating - 2+ Zones, Central Forced Air - Gas","Multi-Zone, Central AC, Whole House / Attic Fan","Garage, Garage - Attached, Covered",1.0,"Ground Floor Bedroom, Master Bedroom on Ground...",0.0,,...,8.0,1.3,"Tile, Hardwood, Carpet","Dishwasher, Dryer, Garbage disposal, Microwave...",886486.0,2019-10-24,4198000.0,,,Los Altos
1,"HURRY, HURRY.......Great house 3 bed and 2 bat...",SingleFamily,1926.0,Combination,"Wall/Window Unit(s), Evaporative Cooling, See ...","Detached Carport, Garage",4047.0,3,2.0,2.0,...,2.0,1.3,,,505000.0,2019-10-16,525000.0,2019-08-30,328000.0,Los Angeles
2,'THE PERFECT CABIN TO FLIP! Strawberry deligh...,SingleFamily,1958.0,Forced air,,0 spaces,9147.0,2,3.0,1.0,...,,10.1,,,49627.0,2019-08-25,180000.0,,,Strawberry
3,Rare 2-story Gated 5 bedroom Modern Mediterran...,SingleFamily,1947.0,Central,Central Air,"Detached Carport, Driveway, Garage - Two Door",,5,3.0,3.0,...,8.0,0.2,"Wood, Tile",Dishwasher,1775000.0,2019-10-24,1895000.0,2016-08-30,1500000.0,Culver City
4,Beautiful 200 acre ranch land with several pas...,VacantLand,,,,0 spaces,,,,,...,6.0,10.6,,,,2019-06-07,1595000.0,2016-06-27,900000.0,Creston


In [5]:
#将Summary转化为int
all_features.iloc[:,0] = all_features.iloc[:,0].apply(
    lambda x: len(str(x)))

In [6]:
#将日期转化为int
all_features['Listed On'] = all_features['Listed On'].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S.%f')
all_features['Last Sold On'] = all_features['Last Sold On'].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S.%f')

all_features['Listed On'] = all_features['Listed On'].apply(
    lambda x: (x.year-2000)*12 + x.month)
all_features['Last Sold On'] = all_features['Last Sold On'].apply(
    lambda x: (x.year-2000)*12 + x.month)

In [7]:
#将Bedrooms转化为int
def n_bedrooms(s):
    if type(s) is float:
        return s
    else:
        l = len(s)
        if l <= 2:
            return int(s)
        if l <= 21:
            return 1
        else: return int(l/22) + 1


all_features['Bedrooms'] = all_features['Bedrooms'].apply(
    lambda s: n_bedrooms(s))

In [8]:
#Heating,Cooling,Parking,Type,Flooring
def trans_1(s):
    if type(s) is float:
        return 'none'
    l = s.find(',')
    if l == -1:
        return s
    else:
        return s[:l].lower()
    
def trans_2(s):
    if type(s) is float:
        return s
    l = s.find(',')
    if l == -1:
        return s
    else:
        return s[:l].lower()
    
all_features['Heating'] = all_features['Heating'].apply(
    lambda s: trans_1(s))
all_features['Cooling'] = all_features['Cooling'].apply(
    lambda s: trans_1(s))
all_features['Parking'] = all_features['Parking'].apply(
    lambda s: trans_1(s))

all_features['Type'] = all_features['Type'].apply(
    lambda s: trans_2(s))
all_features['Flooring'] = all_features['Flooring'].apply(
    lambda s: trans_2(s))

In [9]:
#Appliances included转化为int
def trans_3(s):
    if type(s) is float:
        return s
    else:
        return s.count(',') + 1
    
all_features['Appliances included'] = all_features['Appliances included'].apply(
    lambda s: trans_3(s))

In [10]:
all_features

Unnamed: 0,Summary,Type,Year built,Heating,Cooling,Parking,Lot,Bedrooms,Bathrooms,Full bathrooms,...,High School Score,High School Distance,Flooring,Appliances included,Tax assessed value,Listed On,Listed Price,Last Sold On,Last Sold Price,City
0,285,,1969.0,"heating - 2+ zones,","multi-zone,","garage,",1.0,5.0,0.0,,...,8.0,1.3,"tile,",6.0,886486.0,238,4198000.0,,,Los Altos
1,288,,1926.0,,"wall/window unit(s),","detached carport,",4047.0,3.0,2.0,2.0,...,2.0,1.3,,,505000.0,238,525000.0,236.0,328000.0,Los Angeles
2,1089,,1958.0,,none,,9147.0,2.0,3.0,1.0,...,,10.1,,,49627.0,236,180000.0,,,Strawberry
3,885,,1947.0,,,"detached carport,",,5.0,3.0,3.0,...,8.0,0.2,"wood,",1.0,1775000.0,238,1895000.0,200.0,1500000.0,Culver City
4,295,,,none,none,,,,,,...,6.0,10.6,,,,234,1595000.0,198.0,900000.0,Creston
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31621,512,,2019.0,,,"attached,",,5.0,6.0,,...,7.0,5.3,"mixed,",8.0,4819317.0,250,8500000.0,187.0,895500.0,Yuba City
31622,649,,1970.0,"forced air,",,"guest,",1626.0,2.0,2.0,1.0,...,,0.9,"laminate,",4.0,296819.0,248,330000.0,252.0,328000.0,Oceanside
31623,467,,,none,none,,,,,,...,5.0,3.7,,,,155,443000.0,,,La Mesa
31624,371,,1984.0,"fireplace(s),",,,,3.0,3.0,2.0,...,6.0,1.7,"laminate,",3.0,334277.0,247,445000.0,252.0,470000.0,San Diego


In [12]:
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79065 entries, 0 to 31625
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Summary                      79065 non-null  float64
 1   Type                         79065 non-null  object 
 2   Year built                   79065 non-null  float64
 3   Heating                      79065 non-null  object 
 4   Cooling                      79065 non-null  object 
 5   Parking                      79065 non-null  object 
 6   Lot                          79065 non-null  float64
 7   Bedrooms                     79065 non-null  float64
 8   Bathrooms                    79065 non-null  float64
 9   Full bathrooms               79065 non-null  float64
 10  Total interior livable area  79065 non-null  float64
 11  Total spaces                 79065 non-null  float64
 12  Garage spaces                79065 non-null  float64
 13  Region          

In [9]:
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape

(79065, 29368)

In [12]:
a = 'Dishwasher, Dryer, Garbage disposal, Microwave, Refrigerator, Washer'
a.find(',')

10