# DonorsChoose.org (Kaggle Competition)

DonorsChoose is an organization that financially supports the projects of public-school teachers across the United States. It receives hundreds of applications and has difficulty evaluating all of them. The objective of the work done in this competition is to create a machine learning model that processes and evaluates applications automatically. This will save time for the volunteers who have to evaluate applications, speed up evaluations, and let volunteers better assist teachers who already have projects in progress.

Because most of the data is text (teachers write essays to be evaluated), I built an NLP model. For the prediction, I first tried Naive Bayes, but its accuracy was not great; I then tried Random Forest, which had much better accuracy.
Due to computing-resource restrictions, I ran the model on just a portion of the data.

In [1]:
# importing libraries
import pandas as pd
import numpy as np

# Load the labelled applications and split them by outcome.
training_set = pd.read_csv('train.csv')
training_set_approved = training_set.loc[training_set['project_is_approved'] == 1]
training_set_disapproved = training_set.loc[training_set['project_is_approved'] == 0]

# Keep at most the first 45000 rows of each class (approved rows first) so the
# model can be trained with limited computing resources.
training_set = pd.concat(
    [training_set_approved.head(45000), training_set_disapproved.head(45000)],
    axis=0,
)

# Unlabelled applications to score, and the per-project resource requests.
test_set = pd.read_csv('test.csv')
resources = pd.read_csv('resources.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Sum the 'price' column of the resources per project id, then left-join that
# total onto both the training and the test applications.
group_resources = resources.groupby('id', as_index=False)['price'].sum()
training_set_resources = pd.merge(training_set, group_resources, on='id', how='left')
test_set_resources = pd.merge(test_set, group_resources, on='id', how='left')

# Rearrange the training columns so that the target, 'project_is_approved',
# comes last.
training_set_resources = training_set_resources.loc[:, [
    'id',
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_submitted_datetime',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
    'teacher_number_of_previously_posted_projects',
    'price',
    'project_is_approved',
]]

In [3]:
#Function for cleaning text that is going to be analyzed with NLP
def cleaning_long_texts(dataset, field):
    """Return the cleaned, stemmed text of one column as a list of strings.

    Every value has all characters other than ASCII letters replaced by
    spaces, is lower-cased and split on whitespace; English stop words
    (NLTK list) are dropped, the remaining words are Porter-stemmed and
    re-joined with single spaces.

    Parameters
    ----------
    dataset : object supporting ``len(dataset)`` and ``dataset[field][i]``
        Rows are looked up by the labels ``0 .. len(dataset) - 1``, so a
        DataFrame passed here must carry a default integer index.
    field : str
        Name of the text column to clean. It is printed as a progress marker.

    Returns
    -------
    list of str
        One cleaned string per row, in label order.

    Notes
    -----
    Values that are not strings (for example missing values) are treated as
    empty text and yield ``''`` instead of raising ``TypeError``.
    """
    import re
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer

    print (field)

    # Loop invariants: build them once instead of once per row / per word.
    non_letters = re.compile('[^a-zA-Z]')
    english_stopwords = set(stopwords.words('english'))
    ps = PorterStemmer()
    column = dataset[field]

    field_corpus = []
    for i in range(0, len(dataset)):
        raw_text = column[i]
        if not isinstance(raw_text, str):
            # Missing or non-text cell: nothing to clean.
            raw_text = ''
        words = non_letters.sub(' ', raw_text).lower().split()
        field_corpus.append(
            ' '.join(ps.stem(word) for word in words if word not in english_stopwords)
        )

    return field_corpus

In [4]:
# Clean the two essays and the title, first for the training frame and then
# for the test frame (same call order as before: essay 1, essay 2, title).
X_train_project_essay_1, X_train_project_essay_2, X_train_project_title = (
    cleaning_long_texts(training_set_resources, text_field)
    for text_field in ('project_essay_1', 'project_essay_2', 'project_title')
)
X_test_project_essay_1, X_test_project_essay_2, X_test_project_title = (
    cleaning_long_texts(test_set_resources, text_field)
    for text_field in ('project_essay_1', 'project_essay_2', 'project_title')
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
project_essay_1
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
project_essay_2
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
project_title
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
project_essay_1
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
project_essay_2
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrato

In [5]:
#bag of words
from sklearn.feature_extraction.text import CountVectorizer

# One vectorizer per text field. Each vocabulary is learned from the training
# texts only and then reused (transform, not fit_transform) for the test
# texts, so that column j counts the same token in both frames. Fitting again
# on the test texts would produce columns from a different vocabulary.
cv_project_essay_1 = CountVectorizer(max_features = 3000)
X_train_project_essay_1 = pd.DataFrame(cv_project_essay_1.fit_transform(X_train_project_essay_1).toarray())
X_test_project_essay_1 = pd.DataFrame(cv_project_essay_1.transform(X_test_project_essay_1).toarray())

cv_project_essay_2 = CountVectorizer(max_features = 3000)
X_train_project_essay_2 = pd.DataFrame(cv_project_essay_2.fit_transform(X_train_project_essay_2).toarray())
X_test_project_essay_2 = pd.DataFrame(cv_project_essay_2.transform(X_test_project_essay_2).toarray())

cv_project_title = CountVectorizer(max_features = 3000)
X_train_project_title = pd.DataFrame(cv_project_title.fit_transform(X_train_project_title).toarray())
X_test_project_title = pd.DataFrame(cv_project_title.transform(X_test_project_title).toarray())

# Keep the original name bound; it refers to the title vectorizer.
cv = cv_project_title

In [6]:
# Append the bag-of-words columns (essay 1, essay 2, title - in that order)
# to the right of the existing columns of each frame.
training_set_resources = pd.concat(
    [
        training_set_resources,
        X_train_project_essay_1,
        X_train_project_essay_2,
        X_train_project_title,
    ],
    axis=1,
)

test_set_resources = pd.concat(
    [
        test_set_resources,
        X_test_project_essay_1,
        X_test_project_essay_2,
        X_test_project_title,
    ],
    axis=1,
)

In [7]:
# Move the target to the far right under the name 'project_is_approved_2'
# (it has to follow the bag-of-words columns that were appended after it).
training_set_resources['project_is_approved_2'] = training_set_resources.pop('project_is_approved')

# Drop the raw text and the identifier-like columns from the training frame.
training_set_resources.drop(
    columns=[
        'project_essay_1',
        'project_essay_2',
        'project_essay_3',
        'project_essay_4',
        'project_resource_summary',
        'id',
        'teacher_id',
        'teacher_prefix',
        'project_submitted_datetime',
        'project_title',
    ],
    inplace=True,
)

# Same cleanup for the test frame, except that 'id' is kept.
test_set_resources.drop(
    columns=[
        'project_essay_1',
        'project_essay_2',
        'project_essay_3',
        'project_essay_4',
        'project_resource_summary',
        'teacher_id',
        'teacher_prefix',
        'project_submitted_datetime',
        'project_title',
    ],
    inplace=True,
)

In [8]:
#independent and dependent variables
# The target ('project_is_approved_2') is the last training column, so it is
# selected relative to the end instead of by a hard-coded position that only
# holds for one particular number of bag-of-words columns.
X = training_set_resources.iloc[:, :-1].values
y = training_set_resources.iloc[:, -1].values
# Skip the first test column and keep everything after it.
# NOTE(review): assumes 'id' is the first column of test.csv - confirm.
Prediction = test_set_resources.iloc[:, 1:].values

In [9]:
#treating categorical data
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column (positions 0-3 of X and Prediction).
labelencoder_X_state = LabelEncoder()
labelencoder_X_grade_category = LabelEncoder()
labelencoder_X_subject_categories = LabelEncoder()
labelencoder_X_subject_subcategories = LabelEncoder()

# Each encoder is fitted on the values of BOTH sets and then applied to both,
# so a given category maps to the same integer in X and in Prediction.
# Separately fitted encoders can assign different integers to the same
# category whenever the two sets do not contain exactly the same categories.
for column_position, shared_encoder in enumerate([
    labelencoder_X_state,
    labelencoder_X_grade_category,
    labelencoder_X_subject_categories,
    labelencoder_X_subject_subcategories,
]):
    shared_encoder.fit(
        np.concatenate([X[:, column_position], Prediction[:, column_position]])
    )
    X[:, column_position] = shared_encoder.transform(X[:, column_position])
    Prediction[:, column_position] = shared_encoder.transform(Prediction[:, column_position])

# The test-side names are kept as aliases of the shared encoders.
labelencoder_X_test_state = labelencoder_X_state
labelencoder_X_test_grade_category = labelencoder_X_grade_category
labelencoder_X_test_subject_categories = labelencoder_X_subject_categories
labelencoder_X__test_subject_subcategories = labelencoder_X_subject_subcategories

In [10]:
#split training set and test set
# Hold out 20% of the labelled rows for evaluation (fixed seed for
# reproducibility). train_test_split is imported from sklearn.model_selection,
# the module this file already uses for cross_val_score; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)



In [11]:
#feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the scaling statistics from the training split only ...
X_train = sc.fit_transform(X_train)
# ... and apply those same statistics to the hold-out split and to the data
# to be predicted. Re-fitting on Prediction would scale it differently from
# the data the model is trained on.
X_test = sc.transform(X_test)
Prediction = sc.transform(Prediction)



In [12]:
# Fit a random forest on the training split.
from sklearn.ensemble import RandomForestClassifier

classifier_random = RandomForestClassifier(
    n_estimators=350,
    criterion='entropy',
    n_jobs=-1,
    random_state=0,
)
classifier_random.fit(X_train, y_train)

# Predicted labels for the hold-out split and for the data to be submitted.
y_pred_random = classifier_random.predict(X_test)
predction_random = classifier_random.predict(Prediction)

# Class probabilities for the data to be submitted, stored as a float32
# DataFrame.
predction_random_prob = pd.DataFrame(
    classifier_random.predict_proba(Prediction),
    dtype=np.float32,
)

In [13]:
# Confusion matrix of the random forest on the hold-out split
# (true labels against predicted labels).
from sklearn.metrics import confusion_matrix

cm_random = confusion_matrix(y_true=y_test, y_pred=y_pred_random)
print(cm_random)

[[1541 4088]
 [ 605 8313]]


In [None]:
#k-fold --- random forest (10-fold cross-validation on the training split)
from sklearn.model_selection import cross_val_score
accuracies_random = cross_val_score (estimator = classifier_random, X = X_train, y = y_train, cv=10)
# Print both statistics explicitly: as bare expressions the mean was computed
# and then discarded.
print (accuracies_random.mean())
print (accuracies_random.std())

In [None]:
# Write the submission file: the test 'id' next to column 1 of the predicted
# class probabilities, stored under the name 'project_is_approved'.
final_result = pd.concat(
    [test_set_resources['id'], predction_random_prob[1]],
    axis=1,
).rename(columns={1: 'project_is_approved'})

fname = "final_result.csv"
final_result.to_csv(fname, index=False)