# Detecting depression from tweets #

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
import warnings
# Suppress every warning from here on. The filter is installed before the
# azureml import below, so warnings raised while importing are hidden too.
warnings.filterwarnings("ignore")

# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
# NOTE(review): this import lives outside the notebook's import cell — consider
# moving it there so all dependencies are visible in one place.
from azureml.core import Workspace, Dataset

# NOTE(review): the subscription ID and workspace coordinates are hard-coded in
# the notebook — consider reading them from environment variables or a
# workspace config file before sharing.
subscription_id = '6ce0c011-aa35-4425-9b6b-e8d8d8a1b919'
resource_group = 'projekt1_grupa6'
workspace_name = 'depresja_projekt'

# Handle to the Azure ML workspace identified by the three values above.
workspace = Workspace(subscription_id, resource_group, workspace_name)

# Look up the two registered training datasets by name. Note the naming
# mismatch: `train_data2` is bound to the dataset registered as 'train_data'.
train_data1 = Dataset.get_by_name(workspace, name='train_data1')
train_data2 = Dataset.get_by_name(workspace, name='train_data')

In [3]:
# Materialise both Azure ML datasets as pandas DataFrames.
# NOTE(review): the names are rebound from dataset handles to DataFrames, so
# re-running this cell on its own would presumably fail — confirm, and consider
# distinct names for the loaded frames.
train_data1 = train_data1.to_pandas_dataframe()
train_data2 = train_data2.to_pandas_dataframe()

## Joining two train datasets ##

In [4]:
# Keep only the text and label columns of the second dataset.
train_data2 = train_data2[['text', 'class']]

# Give the first dataset the same column names so the two frames stack cleanly
# (assumes it has exactly two columns, text then label — TODO confirm).
train_data1.columns = ['text','class']

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each input keeps its own row labels, so any label the two frames share would
# appear twice and label-based lookups would return several rows.
train_data = pd.concat([train_data1, train_data2], ignore_index=True)

In [5]:
# Row count per raw label, before the labels are recoded.
train_data['class'].value_counts()

non-suicide     116037
suicide         116037
depression       10371
SuicideWatch      9992
Name: class, dtype: int64

In [6]:
# Collapse the three positive labels into a single class, 1.
train_data['class'] = train_data['class'].replace(
    to_replace=['suicide', 'depression', 'SuicideWatch'],
    value=1,
)

In [7]:
# Encode the negative label as class 0.
train_data['class'] = train_data['class'].replace(
    to_replace=['non-suicide'],
    value=0,
)

In [8]:
# Class balance after the labels have been recoded to 1 / 0.
train_data['class'].value_counts()

1    136400
0    116037
Name: class, dtype: int64

## Splitting into train and test sets ##

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_data['text'],   # inputs: raw text
    train_data['class'],  # targets: class labels
    test_size=0.2,
    random_state=0,
)

## Creating TF-IDF features (weighted bag of words) ##

A bag of words is a matrix in which each row is a tweet, each column is a particular word (or n-gram), and each value is the number of times that word appears in the tweet. Here the raw counts are replaced by TF-IDF weights, which down-weight words that appear in many tweets.

In [10]:
# TF-IDF features over unigrams and bigrams. English stop words are removed,
# accents are stripped, and terms found in fewer than 10 documents are dropped.
vectorizer = TfidfVectorizer(stop_words='english',
                             ngram_range=(1,2),
                             min_df=10,
                             strip_accents='unicode')

# Learn the vocabulary and IDF weights from the training texts only, then
# apply them unchanged to the test texts.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Vocabulary size (= number of feature columns). Reading the fitted
# vocabulary_ mapping works on every scikit-learn version, unlike
# get_feature_names(), which was removed in scikit-learn 1.2.
len(vectorizer.vocabulary_)

153652

## Comparing classifiers ##

In [13]:
def classify(classifier, vectorizer):
    """Fit ``classifier`` and print its accuracy and confusion matrix.

    The model is trained on the notebook-level ``X_train_vectorized`` /
    ``y_train`` and then scored on that training data and on
    ``X_test_vectorized`` / ``y_test``. For each split the accuracy is printed
    first, followed by the confusion matrix.

    Parameters
    ----------
    classifier : estimator providing ``fit`` and ``predict``.
    vectorizer : accepted for call compatibility; not used by this function.

    Returns
    -------
    None
    """
    classifier.fit(X_train_vectorized, y_train)

    # Same report for both splits: training data first, then held-out data.
    splits = (
        ('Train Accuracy: ', X_train_vectorized, y_train),
        ('Test Accuracy: ', X_test_vectorized, y_test),
    )
    for label, features, targets in splits:
        predicted = classifier.predict(features)
        print(label, accuracy_score(targets, predicted))
        print(confusion_matrix(targets, predicted))

In [12]:
def find_best_features(classifier, vactorizer):
    """Print the features with the smallest and largest coefficients.

    Parameters
    ----------
    classifier : fitted model exposing ``coef_``; only row 0 is inspected.
    vactorizer : fitted vectorizer whose feature names are in the same order
        as ``classifier.coef_[0]``. The parameter's spelling is kept so that
        existing keyword callers continue to work.

    Prints up to ten feature names with the lowest coefficients, then up to
    ten with the highest (largest first). Returns None.
    """
    # Use the vectorizer that was passed in rather than a notebook global.
    # get_feature_names_out exists from scikit-learn 1.0 on; get_feature_names
    # is the older spelling (removed in 1.2), so support whichever is present.
    get_names = getattr(vactorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vactorizer.get_feature_names
    feature_names = np.asarray(get_names())

    # Feature indices ordered from the most negative to the most positive coefficient.
    sorted_coef_index = classifier.coef_[0].argsort()
    print('Smallest Coefs:\n{}\n'.format(
        feature_names[sorted_coef_index[:10]]))
    # [:-11:-1] walks back from the end: the ten largest, biggest first.
    print('Largest Coefs: \n{}'.format(
        feature_names[sorted_coef_index[:-11:-1]]))

In [15]:
# Logistic regression baseline. max_iter caps the solver at 3000 iterations
# (presumably raised so the solver converges on this data — TODO confirm).
classifier_log = LogisticRegression(max_iter=3000)
# Fit the model and print train/test accuracy with confusion matrices.
classify(classifier_log, vectorizer)

Train Accuracy:  0.9516907734130895
[[ 88350   4473]
 [  5283 103843]]
Test Accuracy:  0.936400728886072
[[21748  1466]
 [ 1745 25529]]


In [16]:
# Print the terms with the most negative and most positive coefficients in the
# fitted logistic regression model.
find_best_features(classifier_log,  vectorizer)

Smallest Coefs:
['crush' 'teenagers' 'dm' 'kinda' 'discord' 'horny' 'minecraft' 'bruh'
 'rant' 'bored']

Largest Coefs: 
['suicide' 'suicidal' 'kill' 'life' 'depression' 'killing' 'anymore' 'die'
 'end' 'pills']
