In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Filepath to main training dataset.
train_file_path = 'train.csv'
# Filepath to properties file.
resources_file_path = 'resources.csv'

# Read data and store in DataFrame.
train_data = pd.read_csv(train_file_path, sep=',')
resources_data = pd.read_csv(resources_file_path, sep=',')

In [3]:
train_data.columns

Index(['id', 'teacher_id', 'teacher_prefix', 'school_state',
       'project_submitted_datetime', 'project_grade_category',
       'project_subject_categories', 'project_subject_subcategories',
       'project_title', 'project_essay_1', 'project_essay_2',
       'project_essay_3', 'project_essay_4', 'project_resource_summary',
       'teacher_number_of_previously_posted_projects', 'project_is_approved'],
      dtype='object')

In [4]:
resources_data.columns

Index(['id', 'description', 'quantity', 'price'], dtype='object')

In [5]:
train_data.head(3)

Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved
0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1
1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,My students need matching shirts to wear for d...,1,0
2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,My students need the 3doodler. We are an SEM s...,5,1


In [6]:
resources_data.head(3)

Unnamed: 0,id,description,quantity,price
0,p233245,LC652 - Lakeshore Double-Space Mobile Drying Rack,1,149.0
1,p069063,Bouncy Bands for Desks (Blue support pipes),3,14.95
2,p069063,Cory Stories: A Kid's Book About Living With Adhd,1,8.45


In [7]:
train_data["id"]

0         p036502
1         p039565
2         p233823
3         p185307
4         p013780
5         p063374
6         p103285
7         p181781
8         p114989
9         p191410
10        p030093
11        p226941
12        p225747
13        p173555
14        p055350
15        p060293
16        p199435
17        p074849
18        p232007
19        p037127
20        p230221
21        p116615
22        p106275
23        p070918
24        p144291
25        p116102
26        p070029
27        p107356
28        p031939
29        p044085
           ...   
182050    p046158
182051    p117186
182052    p252101
182053    p135316
182054    p021728
182055    p054114
182056    p034326
182057    p149979
182058    p014629
182059    p212015
182060    p110625
182061    p053417
182062    p053967
182063    p226452
182064    p177332
182065    p030914
182066    p224932
182067    p245029
182068    p154340
182069    p235466
182070    p068185
182071    p248714
182072    p045565
182073    p078709
182074    

In [8]:
print(train_data.shape)
print(resources_data.shape)

(182080, 16)
(1541272, 4)


In [9]:
print("Duplicated") if True in train_data["id"].duplicated().tolist() else print("Non duplicated")
print("Duplicated") if True in resources_data["id"].duplicated().tolist() else print("Non duplicated")

Non duplicated
Duplicated


In [10]:
total_price = resources_data.quantity * resources_data.price
resources_data["total_price"] = total_price
resources_data

Unnamed: 0,id,description,quantity,price,total_price
0,p233245,LC652 - Lakeshore Double-Space Mobile Drying Rack,1,149.00,149.00
1,p069063,Bouncy Bands for Desks (Blue support pipes),3,14.95,44.85
2,p069063,Cory Stories: A Kid's Book About Living With Adhd,1,8.45,8.45
3,p069063,"Dixon Ticonderoga Wood-Cased #2 HB Pencils, Bo...",2,13.59,27.18
4,p069063,EDUCATIONAL INSIGHTS FLUORESCENT LIGHT FILTERS...,3,24.95,74.85
5,p069063,Last to Finish: A Story About the Smartest Boy...,1,16.99,16.99
6,p069063,"Mrs. Gorski, I Think I Have the Wiggle Fidgets...",1,9.95,9.95
7,p069063,"See-N-Read 1503905CQ Reading Tool - Book Size,...",2,10.11,20.22
8,p096795,"Brewster WPD90218 Wall Pops Flirt Dot, Set of ...",2,9.95,19.90
9,p096795,Brewster Wall Pops WPE99065 Peel & Stick Calyp...,2,9.02,18.04


In [11]:
#dropping possibly irrelevant columns
#train_data = train_data.drop([], )
resources_data = resources_data.drop(["description", "price"], axis=1)
train_data = train_data.drop(["teacher_id"], axis=1)

In [12]:
resources_data

Unnamed: 0,id,quantity,total_price
0,p233245,1,149.00
1,p069063,3,44.85
2,p069063,1,8.45
3,p069063,2,27.18
4,p069063,3,74.85
5,p069063,1,16.99
6,p069063,1,9.95
7,p069063,2,20.22
8,p096795,2,19.90
9,p096795,2,18.04


In [13]:
grouped_resources_data = resources_data.groupby("id", as_index=False, sort=False).sum()
grouped_resources_data

Unnamed: 0,id,quantity,total_price
0,p233245,1,149.00
1,p069063,13,202.49
2,p096795,44,238.34
3,p149007,252,1078.44
4,p236235,2,298.00
5,p052460,4,516.00
6,p233680,35,350.66
7,p245718,4,599.96
8,p171879,8,393.62
9,p163637,42,298.38


In [14]:
grouped_resources_data.shape


(260115, 3)

In [15]:
train_data.shape

(182080, 15)

In [16]:
cleaned_df = pd.merge(train_data, grouped_resources_data, how="inner", on=["id"])
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,quantity,total_price
0,p036502,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1,6,899.94
1,p039565,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,My students need matching shirts to wear for d...,1,0,20,400.00
2,p233823,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,My students need the 3doodler. We are an SEM s...,5,1,1,469.99
3,p185307,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",,,My students need balls and other activity equi...,16,0,5,684.47
4,p013780,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,,,My students need a water filtration system for...,42,1,2,711.00
5,p063374,Mrs.,DE,2016-11-05 10:01:51,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My kids tell me each day that they want to mak...,I started a program called Telementoring in ho...,,,My students need tablets in order to communic...,0,1,7,727.36
6,p103285,Mrs.,MO,2016-08-31 00:30:43,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,Kindergarten is the new first grade. My studen...,With balance discs and stools as flexible seat...,,,My students need stability stools and inflatab...,1,1,6,414.02
7,p181781,Mrs.,SC,2016-08-03 13:26:01,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,First graders are fantastic! They are excited ...,First graders love learning! We need 6 wiggle-...,,,My students need wiggle stools to allow them t...,0,1,6,414.78
8,p114989,Ms.,IN,2016-09-13 22:35:57,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My seventh graders dream big. They can't wait ...,I have used alternative seating in my classroo...,,,My students need seating that allows the most ...,13,1,4,319.80
9,p191410,Mrs.,IL,2016-09-24 18:38:59,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,I teach first grade in a small farming town in...,There is nothing better than snuggling up with...,,,My students need 2 youth sized reclining chair...,12,1,2,119.76


In [17]:
cleaned_df.shape

(182080, 17)

In [18]:
cleaned_df[30:31]

Unnamed: 0,id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,quantity,total_price
30,p081434,Ms.,NY,2016-12-06 21:19:44,Grades PreK-2,"Health & Sports, Special Needs","Health & Wellness, Special Needs",Seating Like a Boss- Our 21st Century Room,"\""Great job buddy!\"" is something I hear every...",In order to promote essential learning skills ...,,,My students need an opportunity to sit and wor...,9,0,11,233.27


In [19]:
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,quantity,total_price
0,p036502,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1,6,899.94
1,p039565,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,My students need matching shirts to wear for d...,1,0,20,400.00
2,p233823,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,My students need the 3doodler. We are an SEM s...,5,1,1,469.99
3,p185307,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",,,My students need balls and other activity equi...,16,0,5,684.47
4,p013780,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,,,My students need a water filtration system for...,42,1,2,711.00
5,p063374,Mrs.,DE,2016-11-05 10:01:51,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My kids tell me each day that they want to mak...,I started a program called Telementoring in ho...,,,My students need tablets in order to communic...,0,1,7,727.36
6,p103285,Mrs.,MO,2016-08-31 00:30:43,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,Kindergarten is the new first grade. My studen...,With balance discs and stools as flexible seat...,,,My students need stability stools and inflatab...,1,1,6,414.02
7,p181781,Mrs.,SC,2016-08-03 13:26:01,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,First graders are fantastic! They are excited ...,First graders love learning! We need 6 wiggle-...,,,My students need wiggle stools to allow them t...,0,1,6,414.78
8,p114989,Ms.,IN,2016-09-13 22:35:57,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My seventh graders dream big. They can't wait ...,I have used alternative seating in my classroo...,,,My students need seating that allows the most ...,13,1,4,319.80
9,p191410,Mrs.,IL,2016-09-24 18:38:59,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,I teach first grade in a small farming town in...,There is nothing better than snuggling up with...,,,My students need 2 youth sized reclining chair...,12,1,2,119.76


In [20]:
cleaned_df.groupby("school_state").count()

Unnamed: 0_level_0,id,teacher_prefix,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,quantity,total_price
school_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AK,557,557,557,557,557,557,557,557,557,14,14,557,557,557,557,557
AL,2955,2955,2955,2955,2955,2955,2955,2955,2955,84,84,2955,2955,2955,2955,2955
AR,1757,1757,1757,1757,1757,1757,1757,1757,1757,62,62,1757,1757,1757,1757,1757
AZ,3614,3614,3614,3614,3614,3614,3614,3614,3614,131,131,3614,3614,3614,3614,3614
CA,25695,25694,25695,25695,25695,25695,25695,25695,25695,879,879,25695,25695,25695,25695,25695
CO,1887,1887,1887,1887,1887,1887,1887,1887,1887,66,66,1887,1887,1887,1887,1887
CT,2766,2766,2766,2766,2766,2766,2766,2766,2766,88,88,2766,2766,2766,2766,2766
DC,902,902,902,902,902,902,902,902,902,30,30,902,902,902,902,902
DE,589,589,589,589,589,589,589,589,589,28,28,589,589,589,589,589
FL,10359,10359,10359,10359,10359,10359,10359,10359,10359,289,289,10359,10359,10359,10359,10359


In [21]:
cleaned_df.school_state.unique()

array(['NV', 'GA', 'UT', 'NC', 'CA', 'DE', 'MO', 'SC', 'IN', 'IL', 'VA',
       'PA', 'NY', 'FL', 'NJ', 'TX', 'LA', 'ID', 'OH', 'OR', 'MD', 'WA',
       'MA', 'KY', 'AZ', 'MI', 'CT', 'AR', 'WV', 'NM', 'WI', 'MN', 'OK',
       'AL', 'TN', 'IA', 'KS', 'CO', 'DC', 'WY', 'NH', 'HI', 'SD', 'MT',
       'MS', 'RI', 'VT', 'ME', 'NE', 'AK', 'ND'], dtype=object)

In [22]:
cleaned_df.teacher_prefix.unique()

array(['Ms.', 'Mrs.', 'Mr.', 'Teacher', 'Dr.', nan], dtype=object)

In [23]:
cleaned_df.project_grade_category.unique()

array(['Grades PreK-2', 'Grades 3-5', 'Grades 6-8', 'Grades 9-12'],
      dtype=object)

In [24]:
cleaned_df.project_subject_categories.unique()

array(['Literacy & Language', 'Music & The Arts, Health & Sports',
       'Math & Science, Literacy & Language', 'Health & Sports',
       'Applied Learning, Literacy & Language', 'Math & Science',
       'Literacy & Language, Math & Science', 'Special Needs',
       'Applied Learning, Special Needs',
       'Applied Learning, Music & The Arts',
       'Math & Science, Special Needs', 'Health & Sports, Special Needs',
       'Literacy & Language, History & Civics',
       'Health & Sports, Literacy & Language', 'Music & The Arts',
       'Literacy & Language, Special Needs',
       'Special Needs, Music & The Arts', 'History & Civics',
       'Health & Sports, Applied Learning', 'Applied Learning',
       'Math & Science, Applied Learning',
       'Literacy & Language, Music & The Arts',
       'Math & Science, Music & The Arts',
       'History & Civics, Special Needs',
       'History & Civics, Literacy & Language', 'Warmth, Care & Hunger',
       'Math & Science, History & Civics',


In [25]:
cleaned_df.project_subject_subcategories.unique()

array(['Literacy', 'Performing Arts, Team Sports',
       'Applied Sciences, Literature & Writing', 'Health & Wellness',
       'Character Education, Literature & Writing',
       'Early Development, Literature & Writing', 'Mathematics',
       'Literature & Writing, Mathematics', 'Literacy, Mathematics',
       'Character Education, Literacy', 'Literature & Writing',
       'Special Needs', 'ESL, Mathematics',
       'Health & Life Science, Mathematics',
       'College & Career Prep, Literature & Writing',
       'Early Development, Special Needs', 'Applied Sciences',
       'Early Development, Performing Arts', 'Mathematics, Special Needs',
       'Environmental Science, Literature & Writing',
       'Health & Wellness, Special Needs', 'Other, Special Needs',
       'Literacy, Social Sciences',
       'Health & Wellness, Literature & Writing', 'Music',
       'Literacy, Special Needs', 'Applied Sciences, Mathematics',
       'ESL, Literacy', 'Applied Sciences, Special Needs',
      

In [26]:
cleaned_df[['category_1','category_2','category_3']] = cleaned_df['project_subject_categories'].str.split(',', 3, expand=True)

In [27]:
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,quantity,total_price,category_1,category_2,category_3
0,p036502,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1,6,899.94,Literacy & Language,,
1,p039565,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,My students need matching shirts to wear for d...,1,0,20,400.00,Music & The Arts,Health & Sports,
2,p233823,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,My students need the 3doodler. We are an SEM s...,5,1,1,469.99,Math & Science,Literacy & Language,
3,p185307,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",,,My students need balls and other activity equi...,16,0,5,684.47,Health & Sports,,
4,p013780,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,,,My students need a water filtration system for...,42,1,2,711.00,Health & Sports,,
5,p063374,Mrs.,DE,2016-11-05 10:01:51,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My kids tell me each day that they want to mak...,I started a program called Telementoring in ho...,,,My students need tablets in order to communic...,0,1,7,727.36,Applied Learning,Literacy & Language,
6,p103285,Mrs.,MO,2016-08-31 00:30:43,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,Kindergarten is the new first grade. My studen...,With balance discs and stools as flexible seat...,,,My students need stability stools and inflatab...,1,1,6,414.02,Health & Sports,,
7,p181781,Mrs.,SC,2016-08-03 13:26:01,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,First graders are fantastic! They are excited ...,First graders love learning! We need 6 wiggle-...,,,My students need wiggle stools to allow them t...,0,1,6,414.78,Applied Learning,Literacy & Language,
8,p114989,Ms.,IN,2016-09-13 22:35:57,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My seventh graders dream big. They can't wait ...,I have used alternative seating in my classroo...,,,My students need seating that allows the most ...,13,1,4,319.80,Math & Science,,
9,p191410,Mrs.,IL,2016-09-24 18:38:59,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,I teach first grade in a small farming town in...,There is nothing better than snuggling up with...,,,My students need 2 youth sized reclining chair...,12,1,2,119.76,Literacy & Language,,


In [28]:
cleaned_df.category_1.unique()

array(['Literacy & Language', 'Music & The Arts', 'Math & Science',
       'Health & Sports', 'Applied Learning', 'Special Needs',
       'History & Civics', 'Warmth'], dtype=object)

In [29]:
cleaned_df.category_2.unique()

array([None, ' Health & Sports', ' Literacy & Language',
       ' Math & Science', ' Special Needs', ' Music & The Arts',
       ' History & Civics', ' Applied Learning', ' Care & Hunger',
       ' Warmth'], dtype=object)

In [30]:
cleaned_df.category_3.unique()

array([None, ' Care & Hunger'], dtype=object)

In [31]:
cleaned_df["category_1"] = cleaned_df["category_1"].str.strip()
cleaned_df["category_2"] = cleaned_df["category_2"].str.strip()
cleaned_df["category_3"] = cleaned_df["category_3"].str.strip()

In [32]:
cleaned_df.category_1.unique()

array(['Literacy & Language', 'Music & The Arts', 'Math & Science',
       'Health & Sports', 'Applied Learning', 'Special Needs',
       'History & Civics', 'Warmth'], dtype=object)

In [33]:
cleaned_df.category_2.unique()

array([None, 'Health & Sports', 'Literacy & Language', 'Math & Science',
       'Special Needs', 'Music & The Arts', 'History & Civics',
       'Applied Learning', 'Care & Hunger', 'Warmth'], dtype=object)

In [34]:
cleaned_df.category_3.unique()

array([None, 'Care & Hunger'], dtype=object)

In [35]:
cleaned_df.total_price.max()

15299.69

In [36]:
cleaned_df.total_price.min()

100.0

In [37]:
cleaned_df["total_price_category"] = pd.cut(
    cleaned_df["total_price"], 
    bins=[0,100,250,500,1000,16000], 
    labels=["0-100","101-250","251-500","501-1000",">1000"]
)

In [38]:
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,...,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,quantity,total_price,category_1,category_2,category_3,total_price_category
0,p036502,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,...,,My students need 6 Ipod Nano's to create and d...,26,1,6,899.94,Literacy & Language,,,501-1000
1,p039565,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,...,,My students need matching shirts to wear for d...,1,0,20,400.00,Music & The Arts,Health & Sports,,251-500
2,p233823,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,...,,My students need the 3doodler. We are an SEM s...,5,1,1,469.99,Math & Science,Literacy & Language,,251-500
3,p185307,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",...,,My students need balls and other activity equi...,16,0,5,684.47,Health & Sports,,,501-1000
4,p013780,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,...,,My students need a water filtration system for...,42,1,2,711.00,Health & Sports,,,501-1000
5,p063374,Mrs.,DE,2016-11-05 10:01:51,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My kids tell me each day that they want to mak...,I started a program called Telementoring in ho...,...,,My students need tablets in order to communic...,0,1,7,727.36,Applied Learning,Literacy & Language,,501-1000
6,p103285,Mrs.,MO,2016-08-31 00:30:43,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,Kindergarten is the new first grade. My studen...,With balance discs and stools as flexible seat...,...,,My students need stability stools and inflatab...,1,1,6,414.02,Health & Sports,,,251-500
7,p181781,Mrs.,SC,2016-08-03 13:26:01,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,First graders are fantastic! They are excited ...,First graders love learning! We need 6 wiggle-...,...,,My students need wiggle stools to allow them t...,0,1,6,414.78,Applied Learning,Literacy & Language,,251-500
8,p114989,Ms.,IN,2016-09-13 22:35:57,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My seventh graders dream big. They can't wait ...,I have used alternative seating in my classroo...,...,,My students need seating that allows the most ...,13,1,4,319.80,Math & Science,,,251-500
9,p191410,Mrs.,IL,2016-09-24 18:38:59,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,I teach first grade in a small farming town in...,There is nothing better than snuggling up with...,...,,My students need 2 youth sized reclining chair...,12,1,2,119.76,Literacy & Language,,,101-250


In [39]:
cleaned_df["n_previous_projects"] = pd.cut(
    cleaned_df["teacher_number_of_previously_posted_projects"],
    bins=[-1,1,5,10,25,50,500],
    labels=['0-1','2-5','6-10','11-25','26-50','51+']
)

In [40]:
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,...,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,quantity,total_price,category_1,category_2,category_3,total_price_category,n_previous_projects
0,p036502,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,...,My students need 6 Ipod Nano's to create and d...,26,1,6,899.94,Literacy & Language,,,501-1000,26-50
1,p039565,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,...,My students need matching shirts to wear for d...,1,0,20,400.00,Music & The Arts,Health & Sports,,251-500,0-1
2,p233823,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,...,My students need the 3doodler. We are an SEM s...,5,1,1,469.99,Math & Science,Literacy & Language,,251-500,2-5
3,p185307,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",...,My students need balls and other activity equi...,16,0,5,684.47,Health & Sports,,,501-1000,11-25
4,p013780,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,...,My students need a water filtration system for...,42,1,2,711.00,Health & Sports,,,501-1000,26-50
5,p063374,Mrs.,DE,2016-11-05 10:01:51,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My kids tell me each day that they want to mak...,I started a program called Telementoring in ho...,...,My students need tablets in order to communic...,0,1,7,727.36,Applied Learning,Literacy & Language,,501-1000,0-1
6,p103285,Mrs.,MO,2016-08-31 00:30:43,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,Kindergarten is the new first grade. My studen...,With balance discs and stools as flexible seat...,...,My students need stability stools and inflatab...,1,1,6,414.02,Health & Sports,,,251-500,0-1
7,p181781,Mrs.,SC,2016-08-03 13:26:01,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,First graders are fantastic! They are excited ...,First graders love learning! We need 6 wiggle-...,...,My students need wiggle stools to allow them t...,0,1,6,414.78,Applied Learning,Literacy & Language,,251-500,0-1
8,p114989,Ms.,IN,2016-09-13 22:35:57,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My seventh graders dream big. They can't wait ...,I have used alternative seating in my classroo...,...,My students need seating that allows the most ...,13,1,4,319.80,Math & Science,,,251-500,11-25
9,p191410,Mrs.,IL,2016-09-24 18:38:59,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,I teach first grade in a small farming town in...,There is nothing better than snuggling up with...,...,My students need 2 youth sized reclining chair...,12,1,2,119.76,Literacy & Language,,,101-250,11-25


In [41]:
cleaned_df["project_submitted_datetime"] = pd.to_datetime(cleaned_df['project_submitted_datetime'])
cleaned_df["month"] = cleaned_df['project_submitted_datetime'].dt.month
cleaned_df["quarter"] = cleaned_df['project_submitted_datetime'].dt.quarter

In [42]:
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,...,project_is_approved,quantity,total_price,category_1,category_2,category_3,total_price_category,n_previous_projects,month,quarter
0,p036502,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,...,1,6,899.94,Literacy & Language,,,501-1000,26-50,11,4
1,p039565,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,...,0,20,400.00,Music & The Arts,Health & Sports,,251-500,0-1,4,2
2,p233823,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,...,1,1,469.99,Math & Science,Literacy & Language,,251-500,2-5,1,1
3,p185307,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",...,0,5,684.47,Health & Sports,,,501-1000,11-25,8,3
4,p013780,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,...,1,2,711.00,Health & Sports,,,501-1000,26-50,8,3
5,p063374,Mrs.,DE,2016-11-05 10:01:51,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My kids tell me each day that they want to mak...,I started a program called Telementoring in ho...,...,1,7,727.36,Applied Learning,Literacy & Language,,501-1000,0-1,11,4
6,p103285,Mrs.,MO,2016-08-31 00:30:43,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,Kindergarten is the new first grade. My studen...,With balance discs and stools as flexible seat...,...,1,6,414.02,Health & Sports,,,251-500,0-1,8,3
7,p181781,Mrs.,SC,2016-08-03 13:26:01,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,First graders are fantastic! They are excited ...,First graders love learning! We need 6 wiggle-...,...,1,6,414.78,Applied Learning,Literacy & Language,,251-500,0-1,8,3
8,p114989,Ms.,IN,2016-09-13 22:35:57,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My seventh graders dream big. They can't wait ...,I have used alternative seating in my classroo...,...,1,4,319.80,Math & Science,,,251-500,11-25,9,3
9,p191410,Mrs.,IL,2016-09-24 18:38:59,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,I teach first grade in a small farming town in...,There is nothing better than snuggling up with...,...,1,2,119.76,Literacy & Language,,,101-250,11-25,9,3


In [43]:
cleaned_df["project_essay_1"] = cleaned_df["project_essay_1"].fillna("")
cleaned_df["project_essay_2"] = cleaned_df["project_essay_2"].fillna("")
cleaned_df["project_essay_3"] = cleaned_df["project_essay_3"].fillna("")
cleaned_df["project_essay_4"] = cleaned_df["project_essay_4"].fillna("")

In [44]:
cleaned_df["project_essay_3"]

0                                                          
1                                                          
2                                                          
3                                                          
4                                                          
5                                                          
6                                                          
7                                                          
8                                                          
9                                                          
10                                                         
11                                                         
12                                                         
13                                                         
14                                                         
15                                                         
16                                      

In [45]:
cleaned_df["project_essay_1"]

0         Most of my kindergarten students come from low...
1         Our elementary school is a culturally rich sch...
2         Hello;\r\nMy name is Mrs. Brotherton. I teach ...
3         My students are the greatest students but are ...
4         My students are athletes and students who are ...
5         My kids tell me each day that they want to mak...
6         Kindergarten is the new first grade. My studen...
7         First graders are fantastic! They are excited ...
8         My seventh graders dream big. They can't wait ...
9         I teach first grade in a small farming town in...
10        My classroom has 24 students in it. We have an...
11        My children come to school everyday with the s...
12        My school is located in a high poverty area, j...
13        Each day my fifth graders walk into our \"home...
14        The children at our school come from a variety...
15        My third graders are eager to learn new concep...
16        My students are Hispanics, Spa

In [46]:
cleaned_df["project_essay_1"][0]

'Most of my kindergarten students come from low-income households and are considered \\"at-risk\\". These kids walk to school alongside their parents and most have never been further than walking distance from their house. For 80% of my students, English is not their first language or the language spoken at home. \\r\\n\\r\\nWhile my kindergarten kids have many obstacles in front of them, they come to school each day excited and ready to learn. Most students started the year out never being in a school setting. At the start of the year many had never been exposed to letters. Each day they soak up more knowledge and try their hardest to succeed. They are highly motivated to learn new things every day. We are halfway through the year and they are starting to take off. They know know all letters, some sight words, numbers to 20, and a majority of their letter sounds because of their hard work and determination. I am excited to see the places we will go from here!'

In [47]:
cleaned_df["project_essay_3"][0]

''

In [48]:
cleaned_df["merged_essays"] = cleaned_df['project_title'].astype(str) + " " + cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)

In [49]:
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,...,quantity,total_price,category_1,category_2,category_3,total_price_category,n_previous_projects,month,quarter,merged_essays
0,p036502,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,...,6,899.94,Literacy & Language,,,501-1000,26-50,11,4,Super Sight Word Centers Most of my kindergart...
1,p039565,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,...,20,400.00,Music & The Arts,Health & Sports,,251-500,0-1,4,2,Keep Calm and Dance On Our elementary school i...
2,p233823,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,...,1,469.99,Math & Science,Literacy & Language,,251-500,2-5,1,1,Lets 3Doodle to Learn Hello;\r\nMy name is Mrs...
3,p185307,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",...,5,684.47,Health & Sports,,,501-1000,11-25,8,3,"\""Kid Inspired\"" Equipment to Increase Activit..."
4,p013780,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,...,2,711.00,Health & Sports,,,501-1000,26-50,8,3,We need clean water for our culinary arts clas...
5,p063374,Mrs.,DE,2016-11-05 10:01:51,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My kids tell me each day that they want to mak...,I started a program called Telementoring in ho...,...,7,727.36,Applied Learning,Literacy & Language,,501-1000,0-1,11,4,Need to Reach Our Virtual Mentors!!! My kids ...
6,p103285,Mrs.,MO,2016-08-31 00:30:43,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,Kindergarten is the new first grade. My studen...,With balance discs and stools as flexible seat...,...,6,414.02,Health & Sports,,,251-500,0-1,8,3,Active Kindergartners Kindergarten is the new ...
7,p181781,Mrs.,SC,2016-08-03 13:26:01,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,First graders are fantastic! They are excited ...,First graders love learning! We need 6 wiggle-...,...,6,414.78,Applied Learning,Literacy & Language,,251-500,0-1,8,3,Fabulous Firsties-Wiggling to Learn! First gra...
8,p114989,Ms.,IN,2016-09-13 22:35:57,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My seventh graders dream big. They can't wait ...,I have used alternative seating in my classroo...,...,4,319.80,Math & Science,,,251-500,11-25,9,3,Wobble Chairs Help Fidgety Kids Focus My seven...
9,p191410,Mrs.,IL,2016-09-24 18:38:59,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,I teach first grade in a small farming town in...,There is nothing better than snuggling up with...,...,2,119.76,Literacy & Language,,,101-250,11-25,9,3,Snuggle Up With A Good Book I teach first grad...


In [50]:
cleaned_df["merged_essays"][0]

'Super Sight Word Centers Most of my kindergarten students come from low-income households and are considered \\"at-risk\\". These kids walk to school alongside their parents and most have never been further than walking distance from their house. For 80% of my students, English is not their first language or the language spoken at home. \\r\\n\\r\\nWhile my kindergarten kids have many obstacles in front of them, they come to school each day excited and ready to learn. Most students started the year out never being in a school setting. At the start of the year many had never been exposed to letters. Each day they soak up more knowledge and try their hardest to succeed. They are highly motivated to learn new things every day. We are halfway through the year and they are starting to take off. They know know all letters, some sight words, numbers to 20, and a majority of their letter sounds because of their hard work and determination. I am excited to see the places we will go from here! 

In [51]:
cleaned_df["merged_essays"][18]

'Watch Readers Grow! During our reading workshop students are at daily 5. My students need activities to help them practice skills in a fun and enjoyable way that is on the level of each child. As the teacher I enjoy conferencing with each student, so the more engaged the students are practicing the skills they. My students lack confidence. I have a class with such great potential. My students need more hands on learning and extra practice to catch and grow a love for reading. My second graders love to learn. We have resources but most are out dated. My students would be so bright if they could only build confidence. I believe in them and now I want to see them bloom. During our reading workshop. We do mini lessons that focus on a skill and then they rotate through daily 5 (centers). These reading activities will help them apply what they learned and reinforce the skills. They rotate through self read, buddy read (carpet) listening, writing,  word work (reading skills). If the class is

In [52]:
cleaned_df = cleaned_df.drop([
    "project_submitted_datetime", 
    "project_essay_1", 
    "project_essay_2", 
    "project_essay_3", 
    "project_essay_4",
    "quantity",
    "total_price",
    "teacher_number_of_previously_posted_projects"], 
    axis=1
)

In [53]:
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_resource_summary,project_is_approved,category_1,category_2,category_3,total_price_category,n_previous_projects,month,quarter,merged_essays
0,p036502,Ms.,NV,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,My students need 6 Ipod Nano's to create and d...,1,Literacy & Language,,,501-1000,26-50,11,4,Super Sight Word Centers Most of my kindergart...
1,p039565,Mrs.,GA,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,My students need matching shirts to wear for d...,0,Music & The Arts,Health & Sports,,251-500,0-1,4,2,Keep Calm and Dance On Our elementary school i...
2,p233823,Ms.,UT,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,My students need the 3doodler. We are an SEM s...,1,Math & Science,Literacy & Language,,251-500,2-5,1,1,Lets 3Doodle to Learn Hello;\r\nMy name is Mrs...
3,p185307,Mr.,NC,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students need balls and other activity equi...,0,Health & Sports,,,501-1000,11-25,8,3,"\""Kid Inspired\"" Equipment to Increase Activit..."
4,p013780,Mr.,CA,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students need a water filtration system for...,1,Health & Sports,,,501-1000,26-50,8,3,We need clean water for our culinary arts clas...
5,p063374,Mrs.,DE,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My students need tablets in order to communic...,1,Applied Learning,Literacy & Language,,501-1000,0-1,11,4,Need to Reach Our Virtual Mentors!!! My kids ...
6,p103285,Mrs.,MO,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,My students need stability stools and inflatab...,1,Health & Sports,,,251-500,0-1,8,3,Active Kindergartners Kindergarten is the new ...
7,p181781,Mrs.,SC,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,My students need wiggle stools to allow them t...,1,Applied Learning,Literacy & Language,,251-500,0-1,8,3,Fabulous Firsties-Wiggling to Learn! First gra...
8,p114989,Ms.,IN,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My students need seating that allows the most ...,1,Math & Science,,,251-500,11-25,9,3,Wobble Chairs Help Fidgety Kids Focus My seven...
9,p191410,Mrs.,IL,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,My students need 2 youth sized reclining chair...,1,Literacy & Language,,,101-250,11-25,9,3,Snuggle Up With A Good Book I teach first grad...


In [54]:
cleaned_df["project_resource_summary"][2050]

'My students need a balance ball chair, a yoga mat, tape, folders, and a variety of books, videos and other activities centered on yoga and mindfulness.'

In [55]:
cleaned_df["teacher_prefix"] = cleaned_df["teacher_prefix"].fillna("unknown")

In [56]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import *

In [57]:
#filtered_words = [word for word in cleaned_df["merged_essays"][0].split(" ") if word not in stopwords.words('english')]
#filtered_words

test = "\"Hello\" guys! How are you? | This is the best text ever written. :D DD (-.-) (*o*)"
tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize(test.lower()))
print([word for word in tokenizer.tokenize(test.lower()) if word not in stopwords.words('english')])

#print([word for word in tokenizer.tokenize(cleaned_df["merged_essays"][0].lower()) if word not in stopwords.words('english')])

['hello', 'guys', 'how', 'are', 'you', 'this', 'is', 'the', 'best', 'text', 'ever', 'written', 'd', 'dd', 'o']
['hello', 'guys', 'best', 'text', 'ever', 'written', 'dd']


In [58]:
len(cleaned_df["merged_essays"])

182080

In [59]:
cleaned_df["essays_words"] = ""
cleaned_df["essays_words"][0]

''

In [60]:
tokenizer = RegexpTokenizer(r'\w+')

#n = 0
#for i in range(0, len(cleaned_df["merged_essays"])):
#    if (n % 1000) == 0:
#        print(n)
#        n = n + 1
#    essay_words = [word for word in tokenizer.tokenize(cleaned_df["merged_essays"][i].lower()) if word not in stopwords.words('english')]
#    cleaned_df["essays_words"][i] = essay_words

In [61]:
cleaned_df["total_price_category"].unique()

[501-1000, 251-500, 101-250, >1000, 0-100]
Categories (5, object): [0-100 < 101-250 < 251-500 < 501-1000 < >1000]

In [62]:
cleaned_df["n_previous_projects"][0]

'26-50'

In [63]:
cleaned_df["project_is_approved"].unique()

array([1, 0])

In [64]:
cleaned_df["school_state"].unique()

array(['NV', 'GA', 'UT', 'NC', 'CA', 'DE', 'MO', 'SC', 'IN', 'IL', 'VA',
       'PA', 'NY', 'FL', 'NJ', 'TX', 'LA', 'ID', 'OH', 'OR', 'MD', 'WA',
       'MA', 'KY', 'AZ', 'MI', 'CT', 'AR', 'WV', 'NM', 'WI', 'MN', 'OK',
       'AL', 'TN', 'IA', 'KS', 'CO', 'DC', 'WY', 'NH', 'HI', 'SD', 'MT',
       'MS', 'RI', 'VT', 'ME', 'NE', 'AK', 'ND'], dtype=object)

In [65]:
cleaned_df["project_subject_subcategories"].unique()

array(['Literacy', 'Performing Arts, Team Sports',
       'Applied Sciences, Literature & Writing', 'Health & Wellness',
       'Character Education, Literature & Writing',
       'Early Development, Literature & Writing', 'Mathematics',
       'Literature & Writing, Mathematics', 'Literacy, Mathematics',
       'Character Education, Literacy', 'Literature & Writing',
       'Special Needs', 'ESL, Mathematics',
       'Health & Life Science, Mathematics',
       'College & Career Prep, Literature & Writing',
       'Early Development, Special Needs', 'Applied Sciences',
       'Early Development, Performing Arts', 'Mathematics, Special Needs',
       'Environmental Science, Literature & Writing',
       'Health & Wellness, Special Needs', 'Other, Special Needs',
       'Literacy, Social Sciences',
       'Health & Wellness, Literature & Writing', 'Music',
       'Literacy, Special Needs', 'Applied Sciences, Mathematics',
       'ESL, Literacy', 'Applied Sciences, Special Needs',
      

In [74]:
cleaned_df[['subcategory_1','subcategory_2', 'subcategory_3']] = cleaned_df['project_subject_subcategories'].str.split(', ', 3, expand=True)

In [75]:
cleaned_df

Unnamed: 0,id,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_resource_summary,project_is_approved,category_1,...,category_3,total_price_category,n_previous_projects,month,quarter,merged_essays,essays_words,subcategory_1,subcategory_2,subcategory_3
0,p036502,Ms.,NV,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,My students need 6 Ipod Nano's to create and d...,1,Literacy & Language,...,,501-1000,26-50,11,4,Super Sight Word Centers Most of my kindergart...,,Literacy,,
1,p039565,Mrs.,GA,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,My students need matching shirts to wear for d...,0,Music & The Arts,...,,251-500,0-1,4,2,Keep Calm and Dance On Our elementary school i...,,Performing Arts,Team Sports,
2,p233823,Ms.,UT,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,My students need the 3doodler. We are an SEM s...,1,Math & Science,...,,251-500,2-5,1,1,Lets 3Doodle to Learn Hello;\r\nMy name is Mrs...,,Applied Sciences,Literature & Writing,
3,p185307,Mr.,NC,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students need balls and other activity equi...,0,Health & Sports,...,,501-1000,11-25,8,3,"\""Kid Inspired\"" Equipment to Increase Activit...",,Health & Wellness,,
4,p013780,Mr.,CA,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students need a water filtration system for...,1,Health & Sports,...,,501-1000,26-50,8,3,We need clean water for our culinary arts clas...,,Health & Wellness,,
5,p063374,Mrs.,DE,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My students need tablets in order to communic...,1,Applied Learning,...,,501-1000,0-1,11,4,Need to Reach Our Virtual Mentors!!! My kids ...,,Character Education,Literature & Writing,
6,p103285,Mrs.,MO,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,My students need stability stools and inflatab...,1,Health & Sports,...,,251-500,0-1,8,3,Active Kindergartners Kindergarten is the new ...,,Health & Wellness,,
7,p181781,Mrs.,SC,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,My students need wiggle stools to allow them t...,1,Applied Learning,...,,251-500,0-1,8,3,Fabulous Firsties-Wiggling to Learn! First gra...,,Early Development,Literature & Writing,
8,p114989,Ms.,IN,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My students need seating that allows the most ...,1,Math & Science,...,,251-500,11-25,9,3,Wobble Chairs Help Fidgety Kids Focus My seven...,,Mathematics,,
9,p191410,Mrs.,IL,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,My students need 2 youth sized reclining chair...,1,Literacy & Language,...,,101-250,11-25,9,3,Snuggle Up With A Good Book I teach first grad...,,Literacy,,


In [76]:
cleaned_df["subcategory_1"].unique()

array(['Literacy', 'Performing Arts', 'Applied Sciences',
       'Health & Wellness', 'Character Education', 'Early Development',
       'Mathematics', 'Literature & Writing', 'Special Needs', 'ESL',
       'Health & Life Science', 'College & Career Prep',
       'Environmental Science', 'Other', 'Music', 'Visual Arts',
       'History & Geography', 'Gym & Fitness', 'Warmth',
       'Extracurricular', 'Team Sports', 'Social Sciences',
       'Foreign Languages', 'Parent Involvement', 'Nutrition Education',
       'Community Service', 'Financial Literacy', 'Civics & Government',
       'Economics'], dtype=object)

In [77]:
cleaned_df["subcategory_2"].unique()

array([None, 'Team Sports', 'Literature & Writing', 'Mathematics',
       'Literacy', 'Special Needs', 'Performing Arts', 'Social Sciences',
       'Visual Arts', 'ESL', 'Parent Involvement', 'Extracurricular',
       'Environmental Science', 'Early Development', 'Music',
       'College & Career Prep', 'Care & Hunger', 'Health & Wellness',
       'Health & Life Science', 'History & Geography', 'Other',
       'Nutrition Education', 'Community Service', 'Foreign Languages',
       'Warmth', 'Economics', 'Character Education', 'Financial Literacy',
       'Gym & Fitness', 'Civics & Government'], dtype=object)

In [78]:
cleaned_df["subcategory_3"].unique()

array([None, 'Care & Hunger'], dtype=object)