In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
stemming = PorterStemmer()
stops = set(stopwords.words("english"))

def apply_cleaning_function_to_list(X):
    cleaned_X = []
    for element in X:
        cleaned_X.append(clean_text(element))
    return cleaned_X

def clean_text(raw_text):
    text = raw_text.lower()
    tokens = nltk.word_tokenize(text)
    token_words = [w for w in tokens if w.isalpha()]
    stemmed_words = [stemming.stem(w) for w in token_words]
    meaningful_words = [w for w in stemmed_words if not w in stops]
    joined_words = ( " ".join(meaningful_words))

    return joined_words

In [3]:
def create_bag_of_words(X):
    from sklearn.feature_extraction.text import CountVectorizer
    
    vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 ngram_range = (1,2), \
                                 max_features = 10000
                                ) 

    train_data_features = vectorizer.fit_transform(X)
    train_data_features = train_data_features.toarray()

    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf = TfidfTransformer()
    tfidf_features = tfidf.fit_transform(train_data_features).toarray()

    vocab = vectorizer.get_feature_names()
   
    return vectorizer, vocab, train_data_features, tfidf_features, tfidf

In [4]:
data = pd.read_csv('data/sns_record.csv')
data.columns = ['race','wealth','age','comment']
data_sampled = data#[data['race'] == 'white' ]

In [5]:
text_to_clean = list(data_sampled['comment'])
cleaned_text = apply_cleaning_function_to_list(text_to_clean)
data_sampled['cleaned_comment'] = cleaned_text
del cleaned_text

In [6]:
words = list(data_sampled['cleaned_comment'])
vectorizer, vocab, train_data_features, tfidf_features, tfidf = create_bag_of_words(words)
dist = np.sum(train_data_features, axis=0)

In [7]:
bag_dictionary = pd.DataFrame()
bag_dictionary['ngram'] = vocab
bag_dictionary['count'] = dist

# Sort by raw count
bag_dictionary.sort_values(by=['count'], ascending=False, inplace=True)
# Show top 10
print(bag_dictionary.head(100))

        ngram  count
4647     look    231
7081     room    147
6843     rent    122
7184  roommat    115
354     apart    109
...       ...    ...
3261     good     21
8608     unit     21
461     april     21
4265     larg     21
7028    right     21

[100 rows x 2 columns]


In [8]:
print(bag_dictionary)

             ngram  count
4647          look    231
7081          room    147
6843          rent    122
7184       roommat    115
354          apart    109
...            ...    ...
3415      ha drama      1
3414       ha dine      1
3413     ha commun      1
3412       ha club      1
9222  ïmore person      1

[9223 rows x 2 columns]


In [9]:
for i,v in enumerate(bag_dictionary['ngram']):
    print(i,v)

0 look
1 room
2 rent
3 roommat
4 apart
5 move
6 bedroom
7 would
8 hous
9 area
10 home
11 month
12 prefer
13 park
14 live
15 bathroom
16 includ
17 work
18 atlanta
19 locat
20 place
21 privat
22 clean
23 ha
24 util
25 veri
26 pet
27 leas
28 one
29 thi
30 also
31 like
32 furnish
33 space
34 current
35 quiet
36 ani
37 pleas
38 bath
39 need
40 get
41 love
42 share
43 open
44 someon
45 hi
46 around
47 interest
48 friendli
49 find
50 budget
51 know
52 two
53 dog
54 bed
55 onli
56 avail
57 femal
58 peopl
59 march
60 hello
61 end
62 walk
63 cat
64 well
65 thank
66 time
67 great
68 messag
69 commun
70 kitchen
71 year
72 pool
73 floor
74 close
75 fulli
76 everyon
77 midtown
78 free
79 closet
80 want
81 minut
82 come
83 fulli furnish
84 near
85 old
86 profession
87 washer dryer
88 stay
89 would like
90 min
91 downtown
92 washer
93 hey
94 dryer
95 good
96 unit
97 april
98 larg
99 right
100 new
101 price
102 access
103 gym
104 decatur
105 full
106 live room
107 make
108 take
109 look roommat
110 mus

970 kept
971 larg room
972 graphic
973 porch
974 virginia
975 background check
976 reason
977 virginia highland
978 least
979 peac
980 know ani
981 back yard
982 payment
983 overlook
984 bath hous
985 recent graduat
986 real
987 includ monthli
988 well kept
989 know interest
990 graphic design
991 partner look
992 peopl prefer
993 itp
994 tree
995 larg backyard
996 post
997 help would
998 onli one
999 hous around
1000 home recent
1001 move anytim
1002 current work
1003 sink
1004 move apart
1005 home quiet
1006 definit interest
1007 park edgewood
1008 home home
1009 smoke outsid
1010 cute
1011 jog
1012 smoke iím
1013 singl famili
1014 smart
1015 march move
1016 decemb
1017 hous area
1018 hous also
1019 park littl
1020 morningsid
1021 decatur inman
1022 decatur look
1023 pleas feel
1024 marta station
1025 smoke ani
1026 decatur minut
1027 mountain
1028 hot
1029 home rent
1030 place roommat
1031 decid
1032 hope come
1033 month would
1034 park includ
1035 danc
1036 marta bu
1037 month tota

1720 bath atlanta
1721 becom
1722 bath plu
1723 water includ
1724 appreci ani
1725 understand
1726 wonder
1727 applic fee
1728 bike
1729 wonder amen
1730 bigger
1731 block away
1732 amazon
1733 back commun
1734 thi home
1735 ac
1736 blogger
1737 walmart whole
1738 blogger event
1739 buzz
1740 zoom
1741 applianc kitchen
1742 wood
1743 avail fresh
1744 unit central
1745 averag
1746 websit
1747 veri safe
1748 campcreek
1749 babi
1750 amount
1751 around per
1752 around march
1753 apart pet
1754 apart room
1755 batteri
1756 avail end
1757 access kitchen
1758 access live
1759 clean well
1760 apart start
1761 updat
1762 stop right
1763 stone
1764 cabinet
1765 thi month
1766 want look
1767 around piedmont
1768 clean matur
1769 stori
1770 bc
1771 stone mountain
1772 third
1773 access veri
1774 america
1775 clean femal
1776 close airport
1777 area want
1778 uga
1779 close downtown
1780 carpet safe
1781 typic
1782 unit laundri
1783 street walmart
1784 art
1785 attend
1786 unit luxuri
1787 vibe
17

2337 prefer includ
2338 us goofi
2339 us locat
2340 prefer home
2341 prefer high
2342 prefer great
2343 us move
2344 utillit
2345 plu secur
2346 veterinari technician
2347 pianist
2348 pl let
2349 pl
2350 pit bunch
2351 pit
2352 veri import
2353 piedmont budget
2354 veri larg
2355 piedmont area
2356 veri motiv
2357 pictur note
2358 pictur need
2359 pictur differ
2360 pictur also
2361 veri near
2362 pick drop
2363 pick doe
2364 veri nice
2365 pick apart
2366 veri obedi
2367 pic seriou
2368 pic dog
2369 pic big
2370 veri outgo
2371 veri good
2372 veri friendli
2373 place allow
2374 place fit
2375 place look
2376 veri consider
2377 place leas
2378 place know
2379 place includ
2380 place imho
2381 veri cool
2382 place get
2383 place forreal
2384 place forest
2385 place fenc
2386 place apart
2387 place feel
2388 place entrepreneur
2389 place cost
2390 veri easygo
2391 place columbia
2392 place central
2393 place call
2394 place april
2395 veri fast
2396 veri flexibl
2397 pianist veri
2398 p

3219 neighborhood quaint
3220 neighborhood neighbor
3221 neighborhood minut
3222 within home
3223 within mile
3224 neighborhood max
3225 new home
3226 new includ
3227 nice coffe
3228 winward parkway
3229 nice also
3230 wine
3231 next weekend
3232 next thi
3233 wine night
3234 next scad
3235 next publix
3236 wing
3237 next phipp
3238 next month
3239 next gwinnett
3240 next georgia
3241 next exit
3242 wing hous
3243 next bridg
3244 winter
3245 new westsid
3246 new stainless
3247 winter time
3248 new refriger
3249 winward
3250 new peopl
3251 new kitchen
3252 within min
3253 within walkabl
3254 neighborhood east
3255 without children
3256 need sublet
3257 woman would
3258 need stay
3259 need sooner
3260 women anyth
3261 need someth
3262 women onli
3263 need sinc
3264 need roommat
3265 need roomat
3266 need room
3267 need ride
3268 need rent
3269 women roommat
3270 need noth
3271 wonít
3272 need non
3273 need month
3274 need inform
3275 wonít worri
3276 wood floor
3277 wood tile
3278 need h

3719 thi brand
3720 roommat requir
3721 roommat profession
3722 thi closet
3723 roommat replac
3724 thi combin
3725 thi definit
3726 roommat regardless
3727 roommat recent
3728 roommat reason
3729 roommat realli
3730 roommat quiet
3731 roommat push
3732 safe neighbor
3733 texa
3734 safe park
3735 school train
3736 secur access
3737 tenanc
3738 second stori
3739 second garag
3740 second closet
3741 second bedroom
3742 tenanc month
3743 tenant budget
3744 seattl need
3745 seattl
3746 search someon
3747 search roommat
3748 search room
3749 search quit
3750 tenant ever
3751 search area
3752 tenant move
3753 tenant prefer
3754 sculpt kitchen
3755 sculpt
3756 screen tv
3757 screen
3758 scientist amazon
3759 ten ft
3760 secur buzz
3761 secur camera
3762 temp
3763 seek quiet
3764 seek profession
3765 technolog
3766 seek place
3767 seek femal
3768 technolog fortun
3769 seek cat
3770 tell also
3771 seek anoth
3772 tell thi
3773 see time
3774 ten
3775 see friend
3776 see franni
3777 see either
37

4468 regularli onli
4469 regularli
4470 regul also
4471 regul
4472 regist siberian
4473 regist servic
4474 transfer atlanta
4475 regardless gender
4476 regardless
4477 quiet safe
4478 tv huge
4479 rent determin
4480 profession modera
4481 unabl
4482 profession live
4483 profession like
4484 profession job
4485 unabl anyth
4486 profession includ
4487 profession get
4488 underground
4489 underground park
4490 profession engin
4491 profession employ
4492 profession around
4493 underscor
4494 underscor end
4495 understand alway
4496 product audio
4497 understand respect
4498 produc song
4499 unfinish
4500 step
4501 unfortun cat
4502 unfortun dog
4503 process leas
4504 umass amherst
4505 profession ny
4506 tv mount
4507 profession privat
4508 ty read
4509 properti ha
4510 properti dishwash
4511 properti concern
4512 type
4513 proper introduct
4514 type drug
4515 typic person
4516 typic thing
4517 proper
4518 proof incom
4519 proof employ
4520 uga grad
4521 progress ani
4522 progress
4523 ug

5167 commun buslin
5168 commun quiet
5169 commun pool
5170 commun orient
5171 commun one
5172 commun need
5173 commun minut
5174 commun loung
5175 commun look
5176 commun larg
5177 commun good
5178 commun flexibl
5179 commun farm
5180 commun conveni
5181 combo space
5182 combo
5183 combin custom
5184 closet basement
5185 closet second
5186 closet rent
5187 closet person
5188 closet must
5189 closet live
5190 closet half
5191 closet ground
5192 closet fulli
5193 closet fit
5194 closet dresser
5195 closet dm
5196 closet bedroom
5197 closet bathroom
5198 closet avail
5199 closet thi
5200 closest entri
5201 closest
5202 closer citi
5203 closer area
5204 close walmart
5205 close sugarloaf
5206 close second
5207 close sandi
5208 close proxim
5209 close open
5210 close near
5211 close lot
5212 close kroger
5213 closet shelv
5214 closet vertic
5215 combin
5216 code secur
5217 columbia sc
5218 columbia
5219 collier hill
5220 collier
5221 collect interest
5222 collect busi
5223 collag park
5224 

6218 anyth ne
6219 anyth might
6220 anyth lower
6221 anyth let
6222 anyth excess
6223 anyth convers
6224 anyth cloth
6225 anyth around
6226 anywher allow
6227 anyth appreci
6228 anyon want
6229 anyon rent
6230 anyon need
6231 anyon help
6232 anybodi subleas
6233 anybodi know
6234 anti social
6235 anti
6236 answer ani
6237 answer
6238 ansley park
6239 ansley
6240 anyway lem
6241 anywher nearbi
6242 apart messag
6243 apart gate
6244 apart mark
6245 apart mainli
6246 apart maa
6247 apart long
6248 apart level
6249 apart less
6250 apart leas
6251 apart hunt
6252 apart heart
6253 apart gym
6254 apart great
6255 apart get
6256 apart gener
6257 apart full
6258 anywher rel
6259 apart frisco
6260 apart five
6261 apart end
6262 apart electr
6263 apart downtown
6264 apart could
6265 apart cool
6266 apart buckhead
6267 apart belara
6268 apart bathroom
6269 apart avail
6270 apart atlanata
6271 apart amaz
6272 april love
6273 april may
6274 april move
6275 around one
6276 artist love
6277 artist als

6717 belt
6718 belli rub
6719 belli
6720 believ southern
6721 bedroom includ
6722 bedroom hope
6723 bedroom full
6724 bear recent
6725 becam one
6726 becam
6727 beauti terrac
6728 beauti swan
6729 beauti sunset
6730 beauti rooftop
6731 beauti remodel
6732 beauti privat
6733 beauti lake
6734 beauti german
6735 beauti friend
6736 beauti bth
6737 beat
6738 bear
6739 becaus feel
6740 beagl well
6741 beagl
6742 bdr suit
6743 bdr
6744 bd bath
6745 bd
6746 bc energet
6747 bc desper
6748 bbq veget
6749 bbq
6750 batteri two
6751 batteri brave
6752 bathroom wifi
6753 becaus corona
6754 becaus like
6755 bedroom empti
6756 bed privat
6757 bedroom doe
6758 bedroom current
6759 bedroom closet
6760 bedroom big
6761 bedroom bed
6762 bedroom access
6763 bed wooden
6764 bed wardrob
6765 bed ton
6766 bed thi
6767 bed talk
6768 bed spare
6769 bed size
6770 bed price
6771 becaus sooo
6772 bed one
6773 bed live
6774 bed inman
6775 bed desk
6776 bed closet
6777 bed box
6778 becuas look
6779 becuas
6780 becom

7217 late older
7218 late night
7219 last year
7220 larg bedroom
7221 last roommat
7222 last part
7223 larg wrap
7224 larg window
7225 larg roommat
7226 larg quartz
7227 larg portion
7228 larg outdoor
7229 larg kitchen
7230 larg fridg
7231 larg fenc
7232 larg enough
7233 larg corner
7234 know miracul
7235 know look
7236 know know
7237 key bug
7238 kirkwood apart
7239 king charl
7240 king
7241 kind roommat
7242 kind lush
7243 kind freak
7244 kid pet
7245 kid look
7246 kid drama
7247 kid coupl
7248 kid art
7249 key name
7250 key ie
7251 key apart
7252 kirkwood eav
7253 kept veri
7254 kept ha
7255 kept commun
7256 kennesaw clean
7257 kennesaw
7258 keep would
7259 keep share
7260 keep room
7261 keep occupi
7262 keep lot
7263 keep littl
7264 keep indoor
7265 keep common
7266 kirkwood area
7267 kirkwood morningsid
7268 know ideal
7269 kitchen tool
7270 know hav
7271 know good
7272 know differ
7273 know clean
7274 know anyon
7275 kitti sleep
7276 kitti margo
7277 kitti lot
7278 kitti look
727

7847 midtown old
7848 might meet
7849 might interest
7850 might good
7851 might go
7852 midtown yorki
7853 midtown work
7854 midtown surround
7855 midtown start
7856 midtown rent
7857 midtown prefer
7858 midtown pleas
7859 midtown place
7860 midtown open
7861 midtown north
7862 midsiz
7863 midtown neighborhood
7864 midtown negoti
7865 midtown morningsid
7866 midtown marta
7867 midtown leas
7868 midtown ksu
7869 midtown grant
7870 midtown go
7871 midtown buckhead
7872 midtown atl
7873 midtown area
7874 midtown applic
7875 midsiz vehicl
7876 meet new
7877 meet like
7878 master bath
7879 max open
7880 may happi
7881 may end
7882 may earlier
7883 may current
7884 may church
7885 may around
7886 may allow
7887 maximum monthli
7888 maximum amount
7889 max would
7890 max privat
7891 max possibl
7892 max plu
7893 max need
7894 may look
7895 max look
7896 max budget
7897 matur woman
7898 matur enough
7899 matter would
7900 matter lgbtq
7901 match need
7902 match
7903 master privat
7904 master o

8467 expens home
8468 expens fall
8469 expens clean
8470 expect roommat
8471 extra privaci
8472 expans green
8473 expans
8474 exit less
8475 exit
8476 exist lol
8477 exist home
8478 exhaust peopl
8479 exhaust
8480 exercis outdoor
8481 exercis
8482 execut assist
8483 execut
8484 excitingli marta
8485 extra larg
8486 extra privat
8487 exchang life
8488 fall right
8489 farm extra
8490 far max
8491 far atlanta
8492 fan room
8493 fan quirki
8494 fan exercis
8495 fan curb
8496 famili would
8497 famili room
8498 famili orient
8499 famili dollar
8500 famili ani
8501 fall would
8502 fairli relax
8503 extrem conveni
8504 fairli open
8505 fairli extrovert
8506 fairburn work
8507 fairburn
8508 fact graphic
8509 fact
8510 face south
8511 face
8512 fabric soften
8513 fabric
8514 extrovert would
8515 extrovert
8516 extrem spaciou
8517 excitingli
8518 exchang
8519 farm well
8520 even bigger
8521 ever
8522 eventu friendship
8523 eventu
8524 event ton
8525 event coordin
8526 even would
8527 even veri
85