In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
dataset = pd.read_csv('stress_train.csv')
x = dataset['text']
y = dataset['label']

In [3]:
# Check for missing values
missing_values = x.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 0


In [4]:
print(x)

0       We get no child support and are doing well fin...
1       &#x200B; Hey everyone, Being that Hurricane Fl...
2       Woke up 4 days later, intubated and paralyzed....
3       I’m also quite intellectual, I can speak 2 lan...
4       I don't need that. This sucks but is far from ...
                              ...                        
1812    [Facebook <url> [Criss <url> Please help and s...
1813    Every day I'd tell myself I was over this girl...
1814    I hate asking for help and I would ask my fami...
1815    I did something once that I had been doing alm...
1816    Good Morning, I am extremely moved by the memb...
Name: text, Length: 1817, dtype: object


In [5]:
print(y)

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
1812    0.0
1813    1.0
1814    1.0
1815    0.0
1816    0.0
Name: label, Length: 1817, dtype: float64


In [6]:
# Clean the text data
x = x.astype(str).apply(lambda text: re.sub('[^a-zA-Z]', ' ', text).lower())


In [7]:
print(x)

0       we get no child support and are doing well fin...
1         x   b  hey everyone  being that hurricane fl...
2       woke up   days later  intubated and paralyzed ...
3       i m also quite intellectual  i can speak   lan...
4       i don t need that  this sucks but is far from ...
                              ...                        
1812     facebook  url   criss  url  please help and s...
1813    every day i d tell myself i was over this girl...
1814    i hate asking for help and i would ask my fami...
1815    i did something once that i had been doing alm...
1816    good morning  i am extremely moved by the memb...
Name: text, Length: 1817, dtype: object


In [8]:
# Tokenize the text
x = x.apply(nltk.word_tokenize)

In [9]:
print(x)

0       [we, get, no, child, support, and, are, doing,...
1       [x, b, hey, everyone, being, that, hurricane, ...
2       [woke, up, days, later, intubated, and, paraly...
3       [i, m, also, quite, intellectual, i, can, spea...
4       [i, don, t, need, that, this, sucks, but, is, ...
                              ...                        
1812    [facebook, url, criss, url, please, help, and,...
1813    [every, day, i, d, tell, myself, i, was, over,...
1814    [i, hate, asking, for, help, and, i, would, as...
1815    [i, did, something, once, that, i, had, been, ...
1816    [good, morning, i, am, extremely, moved, by, t...
Name: text, Length: 1817, dtype: object


In [10]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
x = x.apply(lambda x: [word for word in x if word not in stop_words])

In [11]:
print(x)

0       [get, child, support, well, financially, witho...
1       [x, b, hey, everyone, hurricane, florence, occ...
2       [woke, days, later, intubated, paralyzed, know...
3       [also, quite, intellectual, speak, languages, ...
4       [need, sucks, far, hopeless, help, next, bad, ...
                              ...                        
1812    [facebook, url, criss, url, please, help, shar...
1813    [every, day, tell, girl, see, school, instantl...
1814    [hate, asking, help, would, ask, family, disow...
1815    [something, almost, constantly, intervening, y...
1816    [good, morning, extremely, moved, members, sub...
Name: text, Length: 1817, dtype: object


In [12]:
# Apply stemming
stemmer = PorterStemmer()
x = x.apply(lambda x: [stemmer.stem(word) for word in x])

In [13]:
print(x)

0       [get, child, support, well, financi, without, ...
1       [x, b, hey, everyon, hurrican, florenc, occur,...
2       [woke, day, later, intub, paralyz, know, lock,...
3       [also, quit, intellectu, speak, languag, nativ...
4       [need, suck, far, hopeless, help, next, bad, t...
                              ...                        
1812    [facebook, url, criss, url, pleas, help, share...
1813    [everi, day, tell, girl, see, school, instantl...
1814    [hate, ask, help, would, ask, famili, disown, ...
1815    [someth, almost, constantli, interven, year, i...
1816    [good, morn, extrem, move, member, sub, contri...
Name: text, Length: 1817, dtype: object


In [14]:
# Convert the list of lists back to strings
x = x.apply(lambda words: ' '.join(words))

In [15]:
x

0       get child support well financi without althoug...
1       x b hey everyon hurrican florenc occur less mo...
2       woke day later intub paralyz know lock suffer ...
3       also quit intellectu speak languag nativ langu...
4       need suck far hopeless help next bad thing cam...
                              ...                        
1812    facebook url criss url pleas help share word i...
1813    everi day tell girl see school instantli remin...
1814    hate ask help would ask famili disown move due...
1815    someth almost constantli interven year imagin ...
1816    good morn extrem move member sub contribut goo...
Name: text, Length: 1817, dtype: object

In [16]:
# Create vocabulary
vocabulary = set()
for text in x:
    vocabulary.update(nltk.word_tokenize(text))

print(vocabulary)

{'glass', 'valentin', 'learn', 'tube', 'ocpd', 'romania', 'unmand', 'stolen', 'everywher', 'counsellor', 'wrap', 'buss', 'rightli', 'iaff', 'flashback', 'intent', 'bump', 'contest', 'bartend', 'urgenc', 'kill', 'bland', 'mainten', 'tricycl', 'bff', 'http', 'atheist', 'erect', 'mouth', 'nap', 'jewish', 'address', 'notif', 'ltr', 'mayb', 'pregnant', 'yea', 'brother', 'stabl', 'sincer', 'exam', 'sacrific', 'preserv', 'loveless', 'word', 'forearm', 'tooth', 'indiffer', 'agit', 'teaser', 'medium', 'circumst', 'tbh', 'dotti', 'seth', 'sexi', 'cat', 'floor', 'decept', 'reveal', 'bulk', 'judg', 'gradual', 'conserv', 'textil', 'exact', 'curiou', 'lover', 'overdos', 'guilt', 'spoke', 'vanderbilt', 'feet', 'translat', 'linger', 'hire', 'war', 'matter', 'satisfi', 'email', 'ta', 'kink', 'declin', 'unten', 'concret', 'mate', 'behavior', 'bug', 'cours', 'assign', 'kneed', 'cpap', 'prey', 'piano', 'info', 'go', 'physiolog', 'campground', 'op', 'solidifi', 'handi', 'basket', 'clonazepam', 'furnish', '

In [17]:
vocabulary

{'glass',
 'valentin',
 'learn',
 'tube',
 'ocpd',
 'romania',
 'unmand',
 'stolen',
 'everywher',
 'counsellor',
 'wrap',
 'buss',
 'rightli',
 'iaff',
 'flashback',
 'intent',
 'bump',
 'contest',
 'bartend',
 'urgenc',
 'kill',
 'bland',
 'mainten',
 'tricycl',
 'bff',
 'http',
 'atheist',
 'erect',
 'mouth',
 'nap',
 'jewish',
 'address',
 'notif',
 'ltr',
 'mayb',
 'pregnant',
 'yea',
 'brother',
 'stabl',
 'sincer',
 'exam',
 'sacrific',
 'preserv',
 'loveless',
 'word',
 'forearm',
 'tooth',
 'indiffer',
 'agit',
 'teaser',
 'medium',
 'circumst',
 'tbh',
 'dotti',
 'seth',
 'sexi',
 'cat',
 'floor',
 'decept',
 'reveal',
 'bulk',
 'judg',
 'gradual',
 'conserv',
 'textil',
 'exact',
 'curiou',
 'lover',
 'overdos',
 'guilt',
 'spoke',
 'vanderbilt',
 'feet',
 'translat',
 'linger',
 'hire',
 'war',
 'matter',
 'satisfi',
 'email',
 'ta',
 'kink',
 'declin',
 'unten',
 'concret',
 'mate',
 'behavior',
 'bug',
 'cours',
 'assign',
 'kneed',
 'cpap',
 'prey',
 'piano',
 'info',
 '

In [18]:
# Transform the text data
vectorizer = CountVectorizer(vocabulary=vocabulary)
x_transformed = vectorizer.fit_transform(x)

In [19]:
x_transformed

<1817x5976 sparse matrix of type '<class 'numpy.int64'>'
	with 60039 stored elements in Compressed Sparse Row format>

In [20]:
# Check for missing values
missing_values = y.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 1


In [21]:
# Fill missing values in y with the most frequent class
y = y.fillna(y.mode().iloc[0])

In [22]:
# Check for missing values
missing_values = y.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 0


In [23]:
y

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
1812    0.0
1813    1.0
1814    1.0
1815    0.0
1816    0.0
Name: label, Length: 1817, dtype: float64

In [24]:
x

0       get child support well financi without althoug...
1       x b hey everyon hurrican florenc occur less mo...
2       woke day later intub paralyz know lock suffer ...
3       also quit intellectu speak languag nativ langu...
4       need suck far hopeless help next bad thing cam...
                              ...                        
1812    facebook url criss url pleas help share word i...
1813    everi day tell girl see school instantli remin...
1814    hate ask help would ask famili disown move due...
1815    someth almost constantli interven year imagin ...
1816    good morn extrem move member sub contribut goo...
Name: text, Length: 1817, dtype: object

In [25]:
# Filter out missing values in y
x_transformed = x_transformed[y.notnull()]
y = y[y.notnull()]

In [26]:
# Train the MNB classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(x_transformed, y)

MultinomialNB()

In [27]:
dataset = pd.read_csv('stress_test.csv')
x = dataset['text']

In [28]:
dataset = pd.read_csv('stress_test.csv')
x_test = dataset['text']

# Check for missing values
missing_values = x_test.isnull().sum()
print("Missing values:\n", missing_values)

# Clean the text data
x_test = x_test.astype(str).apply(lambda text: re.sub('[^a-zA-Z]', ' ', text).lower())

print(x_test)

# Tokenize the text
x_test = x_test.apply(nltk.word_tokenize)

# Remove stopwords
stop_words = set(stopwords.words('english'))
x_test = x_test.apply(lambda x_test: [word for word in x_test if word not in stop_words])

print(x_test)

# Apply stemming
stemmer = PorterStemmer()
x_test = x_test.apply(lambda x_test: [stemmer.stem(word) for word in x_test])

print(x_test)

# Convert the list of lists back to strings
x_test = x_test.apply(lambda words: ' '.join(words))

# Create vocabulary
#vocabulary_test = set()
#for text in x_test:
#    vocabulary_test.update(text)
    
#print(vocabulary_test)
    
    
# Transform the text data
vectorizer = CountVectorizer(vocabulary=vocabulary)
x_transformed_test = vectorizer.transform(x_test)






Missing values:
 0
0      i m in my third year at uni and starting my di...
1      as mentioned above  my girlfriends gets pissed...
2      after unsuccessfully looking for a girlfriend ...
3      i wanted this addressed and my advocate said t...
4      i don t have any point  i m just overwhelmed a...
                             ...                        
448    the messages are essentially the guy being fli...
449    this has  obviously  created some crushing ten...
450    earlier that day i was talking to a customer  ...
451    in order to be able to walk  i need donations ...
452    he brought me to hang out with the two of them...
Name: text, Length: 453, dtype: object
0      [third, year, uni, starting, diss, due, januar...
1      [mentioned, girlfriends, gets, pissed, act, un...
2      [unsuccessfully, looking, girlfriend, dating, ...
3      [wanted, addressed, advocate, said, next, pros...
4      [point, overwhelmed, want, give, one, going, r...
                             .

In [29]:
# Predict the labels for the test data
y_pred = nb_classifier.predict(x_transformed_test)
y_pred = y_pred.astype(int)
# Print the predicted labels
print(y_pred)


[1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1
 1 0 0 1 0 0 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1
 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 1 0
 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1 1 0 0
 1 1 0 1 0 1 1 1 1 1 0 1 0 1 0 1 1 0 1 0 1 0 0 0 1 1 1 0 1 0 1 1 1 0 0 1 1
 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 0 0 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 1
 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 0
 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1
 0 1 0 1 0 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 0 0 0 1 1
 1 0 0 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 1 0 1 1 0
 0 0 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 0
 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 0 0
 1 1 1 1 0 1 0 0 0]


In [30]:
pd.DataFrame(y_pred).to_csv("Iterators-group5-exercise3.csv")

In [32]:
from sklearn.model_selection import cross_val_score

# Perform k-fold cross-validation
k = 5  # Number of folds
scores = cross_val_score(nb_classifier, x_transformed_test.toarray(), y_pred, cv=k, scoring='accuracy')

# Calculate mean accuracy
mean_accuracy = scores.mean()
print("Mean Accuracy: {:.2f}%".format(mean_accuracy * 100))


Mean Accuracy: 77.94%
