In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

In [2]:
os.chdir('G:\\Machine learning\\Capstone final 1')

In [3]:
projects = pd.read_csv('projects.csv')
outcome = pd.read_csv('outcomes.csv')
essay = pd.read_csv('essays.csv')

In [4]:
data = pd.merge(left=projects,right=essay,on='projectid')

In [5]:
data = pd.merge(left=data,right=pd.DataFrame(outcome[['projectid','is_exciting']]),on='projectid',how='left')

In [6]:
del(projects,outcome,essay)

In [7]:
data.shape

(664098, 41)

In [8]:
def date(x):
    return datetime.strptime(x,'%Y-%m-%d')

In [9]:
data['new_date'] = data['date_posted'].apply(date)

In [10]:
# All these steps of data engineering are performed after examining the importance of data.
# All the intermediate steps are removed for easy readability.

In [10]:
data.drop(['school_longitude','schoolid','school_county','school_ncesid','school_latitude','school_zip','school_city','school_metro','school_district','secondary_focus_subject','secondary_focus_area','teacher_prefix','teacher_acctid_x','teacher_acctid_y','date_posted'],axis = 1,inplace=True)

In [11]:
data.dropna(subset = ['essay','need_statement','students_reached','fulfillment_labor_materials','primary_focus_subject','short_description','resource_type','grade_level','title'], inplace=True)

In [12]:
data.isnull().sum()

projectid                                     0
school_state                                  0
school_charter                                0
school_magnet                                 0
school_year_round                             0
school_nlns                                   0
school_kipp                                   0
school_charter_ready_promise                  0
teacher_teach_for_america                     0
teacher_ny_teaching_fellow                    0
primary_focus_subject                         0
primary_focus_area                            0
resource_type                                 0
poverty_level                                 0
grade_level                                   0
fulfillment_labor_materials                   0
total_price_excluding_optional_support        0
total_price_including_optional_support        0
students_reached                              0
eligible_double_your_impact_match             0
eligible_almost_home_match              

In [15]:
# Label encoding variables with binary values (True or False)

In [13]:
data['teacher_teach_for_america'] = (LabelEncoder().fit_transform(data['teacher_teach_for_america']))

In [14]:
data['teacher_ny_teaching_fellow'] = (LabelEncoder().fit_transform(data['teacher_ny_teaching_fellow']))

In [15]:
data['eligible_almost_home_match'] = (LabelEncoder().fit_transform(data['eligible_almost_home_match']))

In [16]:
data['eligible_double_your_impact_match'] = (LabelEncoder().fit_transform(data['eligible_double_your_impact_match']))

In [17]:
data['school_charter'] = (LabelEncoder().fit_transform(data['school_charter']))

In [18]:
data['school_magnet'] = (LabelEncoder().fit_transform(data['school_magnet']))

In [19]:
data['school_year_round'] = (LabelEncoder().fit_transform(data['school_year_round']))

In [20]:
data['school_nlns'] = (LabelEncoder().fit_transform(data['school_nlns']))

In [21]:
data['school_kipp'] = (LabelEncoder().fit_transform(data['school_kipp']))

In [22]:
data['school_charter_ready_promise'] = (LabelEncoder().fit_transform(data['school_charter_ready_promise']))

In [23]:
# Splitting test and train
split_date = datetime.strptime('2014-01-01' ,'%Y-%m-%d')

In [24]:
train = data[data['new_date'] < split_date]

In [25]:
test = data[data['new_date'] >= split_date]


In [26]:
train['is_exciting'] = (LabelEncoder().fit_transform(train['is_exciting']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [27]:
train = train.drop('new_date',axis=1)

In [28]:
test = test.drop('new_date',axis=1)

In [29]:
x_train = train.drop('is_exciting',axis=1)

In [30]:
y_train = train.drop(x_train,axis=1)

In [31]:
x_test = test.drop('is_exciting',axis=1)

In [32]:
del(data,train,test)

In [33]:
from scipy.sparse import hstack
from sklearn.feature_extraction import text
import sklearn.decomposition as decomp
import scipy

In [37]:
# Transform input data into desired form
def transform(a):
    ns1 = list(a['need_statement'])
    my_stop_words = text.ENGLISH_STOP_WORDS.union(['student','students','My','need'])
    tf2 = text.TfidfVectorizer(ns1,ngram_range=(1,2),stop_words=my_stop_words,max_features=50)
    ns_tf = tf2.fit_transform(ns1)
    
    sd1 = list(a['short_description'])
    tf1 = text.TfidfVectorizer(sd1,ngram_range=(1,2),stop_words='english',max_features=50)
    sd_tf = tf1.fit_transform(sd1)
    
    title_ = list(a['title'])
    tf3 = text.TfidfVectorizer(title_,stop_words='english',max_features=25)
    tftext_title = tf3.fit_transform(title_)
    
       
    essay_ = list(a['essay'])
    tf4 = text.TfidfVectorizer(essay_,ngram_range=(1,2),stop_words='english',max_features=1000)
    tftext_essay = tf4.fit_transform(essay_)
    
#     deleting intermediate variables to free up space
    del(title_,ns1,sd1,essay_)
    
    
    l = a.drop(['need_statement','short_description','title','essay','projectid'],axis=1)
    l = pd.get_dummies(l)
    l = scipy.sparse.csr_matrix(l.values)
    
    text_out = hstack([l,ns_tf,sd_tf,tftext_title,tftext_essay])
    
    del(ns_tf,sd_tf,tftext_title,tftext_essay,l)
    
    return text_out

    

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [48]:
clf = RandomForestClassifier(n_estimators=500,oob_score=True,random_state=200)

In [51]:
auc_score_out = pd.DataFrame(columns=['kfold','auc_score'])
cv = KFold(n_splits=5, random_state=200, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs, tprs, scores = [], [], []
    
for (train_i, test_i), i in zip(cv.split(x_train), range(5)):
#     clf.fit((np.array(x_train).iloc[train]).to_sparse(), (np.array(y_train).iloc[train]).to_sparse())
      X_train, X_test = x_train.iloc[train_i], x_train.iloc[test_i]
      Y_train, Y_test = y_train.iloc[train_i], y_train.iloc[test_i]
      X_train = transform(X_train)
      
      clf.fit(X_train,Y_train)
#      X_train = X_train.to_sparse()
#      Y_train = Y_train.to_sparse()
#      X_test = X_test.to_sparse()
#      Y_test = Y_test.to_sparse()
#      clf.fit(X_train,Y_train)
      _,_, auc_score_train = roc_auc_score(train_i)
      fpr, tpr, auc_score = roc_auc_score(test_i)
      scores.append((auc_score_train, auc_score))
      auc_score_out.append(i,auc_score)
      fprs.append(fpr)
      tprs.append(tpr)
 
plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])

  if sys.path[0] == '':


NameError: name 'compute_roc_auc' is not defined

In [61]:
oob = clf.oob_score_
oob

0.9366102674414871

In [None]:
X_Test = transform(x_test)

In [59]:
clf.predict_proba(X_Test)

0.9363452680604785

In [95]:
Proba['Project_ID'] = pd.DataFrame(X_Test['projectid'])

In [95]:
Proba['is_exciting'] = pd.DataFrame(clf.predict_proba(X_Test)[:,1])

In [None]:
# Changing the threshold value for each observation.

Proba['is_exciting'] = Proba['is_exciting']*2.7

In [99]:
Proba.to_csv('Proba.csv',index = False)