In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder

In [2]:
test = pd.read_csv('/SMBC-GREEN-DATA-Challenge/data/test-2.csv') 
train = pd.read_csv('/SMBC-GREEN-DATA-Challenge/data/train-2.csv')

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,...,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist
0,0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,...,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23
1,1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,...,BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15
2,2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,...,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51
3,3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,...,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51
4,4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,...,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9


In [4]:
train["created_at"] = pd.to_datetime(train["created_at"])

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19984 entries, 0 to 19983
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  19984 non-null  int64         
 1   created_at  19984 non-null  datetime64[ns]
 2   tree_dbh    19984 non-null  int64         
 3   curb_loc    19984 non-null  object        
 4   health      19984 non-null  int64         
 5   steward     5101 non-null   object        
 6   guards      5041 non-null   object        
 7   sidewalk    19984 non-null  object        
 8   user_type   19984 non-null  object        
 9   problems    7741 non-null   object        
 10  spc_common  19984 non-null  object        
 11  spc_latin   19984 non-null  object        
 12  nta         19984 non-null  object        
 13  nta_name    19984 non-null  object        
 14  borocode    19984 non-null  int64         
 15  boro_ct     19984 non-null  int64         
 16  boroname    19984 non-

In [6]:
data_all = pd.concat([train, test], sort=False).reset_index(drop=True)

In [7]:
#欠損値処理する前はHelpfulが最頻値だった
#Unsureがあるので、欠損値はUnsureにする
train["guards"].mode()

0    Helpful
Name: guards, dtype: object

In [8]:
data_all["guards"] = data_all["guards"].fillna("Unsure")

In [9]:
data_all["guards"].value_counts()

guards
Unsure     31000
Helpful     7366
Harmful     1320
Name: count, dtype: int64

In [10]:
guards_dummies = pd.get_dummies(train['guards'], prefix='guards', drop_first=False, dtype = int)
guards_dummies.head()

Unnamed: 0,guards_Harmful,guards_Helpful,guards_Unsure
0,0,0,0
1,0,1,0
2,0,0,0
3,0,0,0
4,0,0,0


In [11]:
data_all = pd.get_dummies(data_all, columns=['guards'], drop_first=False, dtype = int)

In [12]:
data_all = pd.get_dummies(data_all, columns=['sidewalk', 'curb_loc', 'user_type'], drop_first=False, dtype = int)

In [13]:
data_all.head()

Unnamed: 0.1,Unnamed: 0,created_at,tree_dbh,health,steward,problems,spc_common,spc_latin,nta,nta_name,...,guards_Harmful,guards_Helpful,guards_Unsure,sidewalk_Damage,sidewalk_NoDamage,curb_loc_OffsetFromCurb,curb_loc_OnCurb,user_type_NYC Parks Staff,user_type_TreesCount Staff,user_type_Volunteer
0,0,2015-06-29 00:00:00,14,1.0,,,English oak,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,...,0,0,1,1,0,0,1,0,0,1
1,1,2016-09-21 00:00:00,5,1.0,3or4,,crimson king maple,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,...,0,1,0,0,1,0,1,0,0,1
2,2,2015-09-13 00:00:00,26,2.0,,StonesBranchLights,English oak,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,...,0,0,1,0,1,0,1,0,0,1
3,3,2016-05-09 00:00:00,15,0.0,,,honeylocust,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,...,0,0,1,1,0,0,1,1,0,0
4,4,2016-06-24 00:00:00,23,1.0,,Stones,London planetree,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,...,0,0,1,0,1,0,1,0,0,1


In [14]:
data_all["created_at"] = pd.to_datetime(data_all["created_at"])

In [15]:
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39686 entries, 0 to 39685
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Unnamed: 0                  39686 non-null  int64         
 1   created_at                  39686 non-null  datetime64[ns]
 2   tree_dbh                    39686 non-null  int64         
 3   health                      19984 non-null  float64       
 4   steward                     10277 non-null  object        
 5   problems                    15398 non-null  object        
 6   spc_common                  39686 non-null  object        
 7   spc_latin                   39686 non-null  object        
 8   nta                         39686 non-null  object        
 9   nta_name                    39686 non-null  object        
 10  borocode                    39686 non-null  int64         
 11  boro_ct                     39686 non-null  int64     

In [16]:
data_all["steward"].value_counts()

steward
1or2       8164
3or4       2068
4orMore      45
Name: count, dtype: int64

In [17]:
train["steward"].value_counts()

steward
1or2       3999
3or4       1079
4orMore      23
Name: count, dtype: int64

In [18]:
data_all["problems"].value_counts()

problems
Stones                                  4455
BranchLights                            2045
StonesBranchLights                      1529
BranchOther                              793
RootOther                                717
                                        ... 
StonesTrunkLightsBranchLights             15
StonesRootOtherTrunkOtherBranchOther      15
RootOtherWiresRopeTrunkLights             12
TrunkLightsBranchLightsBranchOther        12
WiresRopeTrunkLightsBranchLights           9
Name: count, Length: 73, dtype: int64

In [19]:
train["problems"].value_counts()

problems
Stones                                  2219
BranchLights                            1036
StonesBranchLights                       762
BranchOther                              408
RootOther                                333
                                        ... 
StonesRootOtherTrunkOtherBranchOther       8
StonesRootOtherWiresRopeBranchOther        5
StonesMetalGratesTrunkOther                4
TrunkLightsBranchLightsBranchOther         4
WiresRopeTrunkLightsBranchLights           4
Name: count, Length: 73, dtype: int64

In [121]:
#stwardとproblemsは最頻値
data_all["steward"] = data_all["steward"].fillna(data_all["steward"].mode()[0])
data_all["problems"] = data_all["problems"].fillna(data_all["problems"].mode()[0])

In [122]:
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39686 entries, 0 to 39685
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Unnamed: 0                  39686 non-null  int64         
 1   created_at                  39686 non-null  datetime64[ns]
 2   tree_dbh                    39686 non-null  int64         
 3   health                      19984 non-null  float64       
 4   steward                     39686 non-null  object        
 5   problems                    39686 non-null  object        
 6   spc_common                  39686 non-null  object        
 7   spc_latin                   39686 non-null  object        
 8   nta                         39686 non-null  object        
 9   nta_name                    39686 non-null  object        
 10  borocode                    39686 non-null  int64         
 11  boro_ct                     39686 non-null  int64     

In [123]:
data_all["nta"].value_counts()

nta
SI54    718
SI11    678
BK82    632
BK45    622
SI01    616
       ... 
QN68      7
MN17      7
MN21      5
BK27      2
MN20      1
Name: count, Length: 187, dtype: int64

In [124]:
# s = 'abcde'
# print(s[:3])
# abc

In [125]:
data_all["re_nta"]  = data_all["nta"].str[:2]

In [126]:
data_all["re_nta"].value_counts()

re_nta
QN    13234
BK    10159
SI     7292
MN     4522
BX     4479
Name: count, dtype: int64

In [127]:
data_all = data_all.drop(["nta", "nta_name"], axis=1)

In [128]:
data_all = pd.get_dummies(data_all, columns=['re_nta'], drop_first=False, dtype = int)

In [129]:
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39686 entries, 0 to 39685
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Unnamed: 0                  39686 non-null  int64         
 1   created_at                  39686 non-null  datetime64[ns]
 2   tree_dbh                    39686 non-null  int64         
 3   health                      19984 non-null  float64       
 4   steward                     39686 non-null  object        
 5   problems                    39686 non-null  object        
 6   spc_common                  39686 non-null  object        
 7   spc_latin                   39686 non-null  object        
 8   borocode                    39686 non-null  int64         
 9   boro_ct                     39686 non-null  int64         
 10  boroname                    39686 non-null  object        
 11  zip_city                    39686 non-null  object    

In [130]:
data_all["spc_common"].value_counts()

spc_common
London planetree       4339
pin oak                4254
cherry                 3345
Japanese zelkova       3016
littleleaf linden      2598
                       ... 
European beech            6
Kentucky yellowwood       5
pond cypress              3
Chinese chestnut          3
Himalayan cedar           1
Name: count, Length: 120, dtype: int64

In [131]:
data_all["spc_latin"].value_counts()

spc_latin
Platanus x acerifolia    4339
Quercus palustris        4254
Prunus                   3345
Zelkova serrata          3016
Tilia cordata            2598
                         ... 
Fagus sylvatica             6
Cladrastis kentukea         5
Taxodium ascendens          3
Castanea mollissima         3
Cedrus deodara              1
Name: count, Length: 120, dtype: int64

In [132]:
data_all["boroname"].value_counts()

boroname
Queens           13234
Brooklyn         10159
Staten Island     7292
Manhattan         4522
Bronx             4479
Name: count, dtype: int64

In [133]:
# #re_nta
# QN    13234
# BK    10159
# SI     7292
# MN     4522
# BX     4479
# Name: count, dtype: int64

In [134]:
#boronameもre_ntaと同じため、削除する
data_all = data_all.drop(["boroname"], axis=1)

In [135]:
data_all["borocode"].value_counts()

borocode
4    13234
3    10159
5     7292
1     4522
2     4479
Name: count, dtype: int64

In [136]:
data_all = data_all.drop(["re_nta_QN", "re_nta_BK", "re_nta_SI", "re_nta_MN", "re_nta_BX"], axis=1)

In [137]:
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39686 entries, 0 to 39685
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Unnamed: 0                  39686 non-null  int64         
 1   created_at                  39686 non-null  datetime64[ns]
 2   tree_dbh                    39686 non-null  int64         
 3   health                      19984 non-null  float64       
 4   steward                     39686 non-null  object        
 5   problems                    39686 non-null  object        
 6   spc_common                  39686 non-null  object        
 7   spc_latin                   39686 non-null  object        
 8   borocode                    39686 non-null  int64         
 9   boro_ct                     39686 non-null  int64         
 10  zip_city                    39686 non-null  object        
 11  cb_num                      39686 non-null  int64     

In [138]:
data_all = data_all.drop(["zip_city"], axis=1)

In [139]:
data_all = data_all.drop(["cb_num", "st_senate", "st_assem", "cncldist"], axis=1)

In [140]:
data_all = pd.get_dummies(data_all, columns=['steward'], drop_first=False, dtype = int)

In [141]:
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39686 entries, 0 to 39685
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Unnamed: 0                  39686 non-null  int64         
 1   created_at                  39686 non-null  datetime64[ns]
 2   tree_dbh                    39686 non-null  int64         
 3   health                      19984 non-null  float64       
 4   problems                    39686 non-null  object        
 5   spc_common                  39686 non-null  object        
 6   spc_latin                   39686 non-null  object        
 7   borocode                    39686 non-null  int64         
 8   boro_ct                     39686 non-null  int64         
 9   guards_Harmful              39686 non-null  int64         
 10  guards_Helpful              39686 non-null  int64         
 11  guards_Unsure               39686 non-null  int64     

In [142]:
data_all["problems"].value_counts()

problems
Stones                                  28743
BranchLights                             2045
StonesBranchLights                       1529
BranchOther                               793
RootOther                                 717
                                        ...  
StonesTrunkLightsBranchLights              15
StonesRootOtherTrunkOtherBranchOther       15
RootOtherWiresRopeTrunkLights              12
TrunkLightsBranchLightsBranchOther         12
WiresRopeTrunkLightsBranchLights            9
Name: count, Length: 73, dtype: int64

In [143]:
data_without_NAME = data_all.drop(["spc_common", "spc_latin"], axis=1)
data_without_NAME = pd.get_dummies(data_without_NAME, columns=['problems'], drop_first=False, dtype = int)

In [144]:
data_without_NAME.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39686 entries, 0 to 39685
Data columns (total 92 columns):
 #   Column                                                              Non-Null Count  Dtype         
---  ------                                                              --------------  -----         
 0   Unnamed: 0                                                          39686 non-null  int64         
 1   created_at                                                          39686 non-null  datetime64[ns]
 2   tree_dbh                                                            39686 non-null  int64         
 3   health                                                              19984 non-null  float64       
 4   borocode                                                            39686 non-null  int64         
 5   boro_ct                                                             39686 non-null  int64         
 6   guards_Harmful                                        

In [149]:
data_without_NAME["year"] = data_without_NAME["created_at"].dt.year
data_without_NAME["month"] = data_without_NAME["created_at"].dt.month
data_without_NAME = data_without_NAME.drop(["created_at"], axis=1)

In [152]:
data_without_NAME = data_without_NAME.drop(["Unnamed: 0"], axis=1)

In [153]:
#trainデータとtestデータに分割
data_train = data_without_NAME[:len(train)]
data_test = data_without_NAME[len(train):].drop(["health"], axis=1)

In [155]:
data_without_NAME["year"].value_counts()

year
2015    30331
2016     9355
Name: count, dtype: int64

In [156]:
data_train.head()

Unnamed: 0,tree_dbh,health,borocode,boro_ct,guards_Harmful,guards_Helpful,guards_Unsure,sidewalk_Damage,sidewalk_NoDamage,curb_loc_OffsetFromCurb,...,problems_WiresRope,problems_WiresRopeBranchLights,problems_WiresRopeBranchOther,problems_WiresRopeTrunkLights,problems_WiresRopeTrunkLightsBranchLights,problems_WiresRopeTrunkOther,problems_WiresRopeTrunkOtherBranchLightsBranchOther,problems_WiresRopeTrunkOtherBranchOther,year,month
0,14,1.0,4,4152901,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,2015,6
1,5,1.0,2,2039901,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,2016,9
2,26,2.0,5,5017011,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,2015,9
3,15,0.0,5,5024401,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,2016,5
4,23,1.0,1,1022102,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,2016,6


In [157]:
y_train = data_train["health"]
X_train = data_train.drop(["health"], axis=1)
X_test = data_test

In [158]:
from sklearn.ensemble import RandomForestClassifier

#ランダムフォレストのパラメータ指定（デフォルト）
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_test = clf.predict(X_test)

In [159]:
result = pd.DataFrame({"Unnamed: 0": test["Unnamed: 0"], "health": y_test})

In [160]:
result["health"] = result["health"].astype(int)
result.to_csv("/SMBC-GREEN-DATA-Challenge/data/submission2.csv", index=False, header=False)

result.head()

Unnamed: 0.1,Unnamed: 0,health
0,19984,1
1,19985,1
2,19986,1
3,19987,1
4,19988,1
