In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Define parameters
data_root_path = 'D:\\DataSets\\KDD_Cup_2014\\Data\\'
projects_filename = "projects.csv"
resources_filename = 'resources.csv'
outcomes_filename = 'outcomes.csv'
sampleSubmission_filename = 'sampleSubmission.csv'
essays_filename = 'essays.csv'

data_key = 'projectid'

# Load the data set
print('Loading data set ......')
projects = pd.read_csv(data_root_path + projects_filename)
resources = pd.read_csv(data_root_path + resources_filename)
outcomes = pd.read_csv(data_root_path + outcomes_filename)
sample = pd.read_csv(data_root_path + sampleSubmission_filename)
essays = pd.read_csv(data_root_path + essays_filename)
print('Data set loaded !')



Loading data set ......
Data set loaded !


In [14]:
# sort the data based on id
projects = projects.sort_values(by=[data_key])
resources = resources.sort_values(by=[data_key])
outcomes = outcomes.sort_values(by=[data_key])
essays = essays.sort_values(by=[data_key])


# Split the training data and testing data
dates = np.array(projects.date_posted)
train_idx = np.where(dates < '2014-01-01')[0]
test_idx = np.where(dates >= '2014-01-01')[0]

In [43]:
# Fill in the missing values
projects = projects.fillna(method='pad')            # fill the missing hole with the previous observation data

# set the target labels
labels = np.array(outcomes.is_exciting)

#preprocessing the data based on different types of attr
projects_numeric_columns = ['school_latitude', 'school_longitude',
                            'fulfillment_labor_materials',
                            'total_price_excluding_optional_support',
                            'total_price_including_optional_support',
                           'students_reached']

projects_id_columns = ['projectid', 'teacher_acctid', 'schoolid', 'school_ncesid']
columns_to_exclude = ['school_county', 'school_city', 'school_state', 'school_zip', 'date_posted']
projects_categorial_columns = np.array(list(set(projects.columns)
                                            .difference(set(projects_numeric_columns))
                                            .difference(set(projects_id_columns))
                                            .difference(set(columns_to_exclude))))

projects_categorial_values = np.array(projects[projects_categorial_columns])

print(projects_categorial_columns)
print(projects_categorial_columns.shape)
print(projects_categorial_values.shape)

['teacher_prefix' 'school_charter' 'school_district' 'school_year_round'
 'teacher_teach_for_america' 'primary_focus_subject' 'school_kipp'
 'secondary_focus_area' 'grade_level' 'eligible_almost_home_match'
 'poverty_level' 'school_magnet' 'eligible_double_your_impact_match'
 'school_metro' 'primary_focus_area' 'secondary_focus_subject'
 'resource_type' 'teacher_ny_teaching_fellow' 'school_nlns'
 'school_charter_ready_promise']
(20,)
(664098, 20)


In [44]:
projects_categorial_values[:5, :]

array([['Mrs.', 'f', 'Pershing Elem Network', 'f', 'f', 'Mathematics',
        'f', 'Music & The Arts', 'Grades PreK-2', 'f', 'highest poverty',
        'f', 'f', 'urban', 'Math & Science', 'Visual Arts', 'Supplies',
        'f', 'f', 'f'],
       ['Mrs.', 'f', 'Arvin Union School District', 'f', 'f', 'Literacy',
        'f', 'Literacy & Language', 'Grades PreK-2', 'f',
        'highest poverty', 'f', 't', 'urban', 'Literacy & Language',
        'Literature & Writing', 'Supplies', 'f', 'f', 'f'],
       ['Mr.', 'f', 'Arcadia Unified School Dist', 'f', 'f',
        'Literature & Writing', 'f', 'Literacy & Language', 'Grades 3-5',
        'f', 'moderate poverty', 'f', 'f', 'urban',
        'Literacy & Language', 'Literacy', 'Books', 'f', 'f', 'f'],
       ['Mrs.', 'f', 'Rsu 73', 'f', 'f', 'Literature & Writing', 'f',
        'Math & Science', 'Grades PreK-2', 'f', 'highest poverty', 'f',
        't', 'rural', 'Literacy & Language', 'Mathematics', 'Technology',
        'f', 'f', 'f'],
   

In [46]:
# One hot encoding
print('one hot encoding...')
enc = OneHotEncoder()
enc.fit(projects_categorial_values)
projects_data = enc.transform(projects_categorial_values)
print('The shape of the project data after one hot encoding', projects_data.shape)

one hot encoding...
The shape of the project data after one hot encoding (664098, 9356)


In [47]:
#Predicting
train = projects_data[train_idx]
test = projects_data[test_idx]
print('shape of test', test.shape)
clf = LogisticRegression()


clf.fit(train, labels=='t')
preds = clf.predict_proba(test)[:,1]
# preds = clf.predict(test)

#Save prediction into a file
sample['is_exciting'] = preds

shape of test (44772, 9356)




In [48]:
type(sample)

pandas.core.frame.DataFrame

In [49]:
sample.head()

Unnamed: 0,projectid,is_exciting
0,ffff7266778f71242675416e600b94e1,0.041802
1,fffeb510ee37a0bb01079f06bf141246,0.036259
2,fff979abefa35a6bdd133b4e4150b737,0.061422
3,fff8beec6de8c9411520d15d1f6979bf,0.135562
4,fff745e9c0b8cc9e73e8c4c9a0ef4292,0.031973


In [50]:
sample.to_csv(data_root_path + 'predictions.csv', index = False)

In [33]:
projects.head(10)

Unnamed: 0,projectid,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,...,resource_type,poverty_level,grade_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,date_posted
148979,00001ccc0e81598c4bd86bacb94d7acb,96963218e74e10c3764a5cfb153e6fea,9f3f9f2c2da7edda5648ccd10554ed8c,170993000000.0,41.807654,-87.673257,Chicago,IL,60609.0,urban,...,Supplies,highest poverty,Grades PreK-2,30.0,1273.82,1498.61,31.0,f,f,2013-04-14
437277,00002bff514104264a6b798356fdd893,3414541eb63108700b188648f866f483,cbaae3265eda78d330cb8ab1a9217071,60327000000.0,35.203447,-118.840956,Arvin,CA,93203.0,urban,...,Supplies,highest poverty,Grades PreK-2,35.0,477.32,561.55,20.0,t,f,2010-09-08
405458,00002d691c05c51a5fdfbb2baef0ba25,7ad6abc974dd8b62773f79f6cbed48d5,56502bae9e97bab5eb54f9001878f469,60297000000.0,34.137997,-118.062795,Arcadia,CA,91007.0,urban,...,Books,moderate poverty,Grades 3-5,35.0,892.31,1049.78,250.0,f,f,2010-12-10
91352,0000b38bbc7252972f7984848cf58098,e1aa1ae5301d0cda860c4d9c89c24919,30fcfca739b17be54ce3f1ee46980340,231140000000.0,44.437717,-70.201292,Livermore,ME,4253.0,rural,...,Technology,highest poverty,Grades PreK-2,30.0,547.86,644.54,36.0,t,f,2013-09-27
49606,0000ee613c92ddc5298bf63142996a5c,e0c0a0214d3c2cfdc0ab6639bc3c5342,38bb0d62aa613c2f933de56c9df855b7,510126000000.0,38.851982,-77.145287,Falls Church,VA,22041.0,suburban,...,Technology,high poverty,Grades PreK-2,30.0,384.86,452.78,19.0,f,f,2013-12-11
255442,0000fa3aa8f6649abab23615b546016d,2a578595fe351e7fce057e048c409b18,3432ed3d4466fac2f2ead83ab354e333,64098010000.0,34.296596,-119.296596,Ventura,CA,93001.0,urban,...,Books,highest poverty,Grades 3-5,35.0,240.1,282.47,28.0,t,f,2012-04-07
189646,0000fb6aea57099cc5b051acb7f52a9e,ad51bb5eabffc738775887955421fe75,d4f02777656b5ee806965ae2186e0adb,470294000000.0,35.037663,-90.092321,Memphis,TN,38109.0,urban,...,Books,highest poverty,Grades 6-8,30.0,382.71,450.25,90.0,f,t,2012-11-17
616019,0001120447a33dd9ffeefa107ed04c43,a799e714a102967d674b258e5ea19231,c843a6322e90dc34304b60b43f4c2205,450258000000.0,34.571828,-80.615642,Kershaw,SC,29067.0,rural,...,Books,high poverty,Grades 6-8,17.0,296.0,360.98,35.0,f,f,2007-08-12
301504,0001146d343ea9452089d0e302496c06,3f71761d508f95684f2924763175dbe8,18e8fc522b79044cf70938cbefce41bb,450387000000.0,34.97755,-81.012395,Rock Hill,SC,29732.0,urban,...,Technology,high poverty,Grades 3-5,35.0,300.97,354.08,21.0,f,t,2011-12-08
511584,0001151477ea5349a0aa64ed1d83f0bc,f30b9edaea56bbade550e2f0da5db4f9,6cd638cff9af07d02c72bb1cc25612d5,261200000000.0,42.367172,-82.985527,Detroit,MI,48214.0,urban,...,Supplies,highest poverty,Grades 3-5,9.0,675.24,823.46,300.0,f,f,2009-08-28
