In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from tqdm import tqdm

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# nltk.download('vader_lexicon')

from tqdm.notebook import tqdm
from sklearn.utils import resample

from sklearn import preprocessing


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mikes\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
def get_train_test():
    cwd = os.getcwd()
    # print(cwd)

    df_train = pd.read_csv(cwd + "\data\drugsComTrain_raw.tsv", sep='\t')
    df_test = pd.read_csv(cwd + "\data\drugsComTest_raw.tsv", sep='\t')

    df_list = [df_train, df_test]

    # print(f"train_len={train_len} test_len={test_len} df_len={len(df)}")
    print(df_train.columns)

    df_train['drugName'] = df_train['drugName'].str.upper()
    number_of_drugs = len(df_train['drugName'].unique())
    print(f"Number of drugs = {number_of_drugs}")

    df_train['condition'] = df_train['condition'].str.upper()
    number_of_drugs = len(df_train['condition'].unique())
    print(f"Number of condition = {number_of_drugs}")

    # Generate columns for review length and positive reviews.  If the rating is greater than 7 of 10, assume it is positive
    for df in df_list:
        df['review_len'] = df['review'].str.len()
        df['is_positive'] = np.where(df['rating'] > 7, 1, 0)

    df_train = balance_class(df_train, 'is_positive')
    df_test = balance_class(df_test, 'is_positive')
    return df_train, df_test

def balance_class(df, colname):
    class1 = df[df[colname] == 1].copy()
    class2 = df[df[colname] == 0].copy()

    if len(class1) > len(class2):
        maj_class = class1
        min_class = class2
    else:
        maj_class = class2
        min_class = class1 

    maj_downsample = resample(maj_class, replace=False, n_samples=len(min_class), random_state=0)

    df_balanced = pd.concat([min_class, maj_downsample])
    return df_balanced

# https://www.nltk.org/howto/sentiment.html
def add_vader(df):
    neg_list, pos_list, compound_list = [], [], []

    sid = SentimentIntensityAnalyzer()
    #for index, value in df['review'].items():
    for index, row in tqdm(df.iterrows(), total=len(df)):
        ss = sid.polarity_scores(row['review'])
        neg_list.append(ss['neg'])
        pos_list.append(ss['pos'])
        compound_list.append(ss['compound'])

    df['neg'] = neg_list
    df['pos'] = pos_list
    df['compound'] = compound_list
    return df

In [None]:
df_train, df_test = get_train_test()

# Create dataframe with the length of the review as the only feature
X_train = df_train[['review_len']].copy()
y_train = df_train['is_positive'].copy()
X_test = df_test[['review_len']].copy()
y_test = df_test['is_positive'].copy()

# Get shapes of dataframes
print(f"X_train.shape={X_train.shape}, y_train.shape={y_train.shape}")
print(f"X_test.shape={X_test.shape}, y_test.shape={y_test.shape}")

# Get VADER columns for negative, positive, and compund values
X_train_vader = add_vader(df_train)
X_test_vader = add_vader(df_test)

# Add columns to dataset
for colname in ['neg', 'pos', 'compound']:
    X_train[colname] = X_train_vader[colname]
    X_test[colname] = X_test_vader[colname]

X_train.shape=(127774, 1), y_train.shape=(127774,)
X_test.shape=(42834, 1), y_test.shape=(42834,)


  0%|          | 0/127774 [00:00<?, ?it/s]

  0%|          | 0/42834 [00:00<?, ?it/s]

In [57]:
print(f"X_train.shape={X_train.shape}, y_train.shape={y_train.shape}")
print(f"X_test.shape={X_test.shape}, y_test.shape={y_test.shape}")

scaler_train = preprocessing.StandardScaler().fit(X_train)
# Create and fit scaler object for test data
scaler_test = preprocessing.StandardScaler().fit(X_test)
# Scaled version of x_train
x_train_scale = scaler_train.transform(X_train)
# Scaled version of x_train
x_test_scale = scaler_test.transform(X_test)

# Build logistic model and fit
model = LogisticRegression()
model.fit(x_train_scale, y_train)

# Make predictions on the test set
y_pred = model.predict(x_test_scale)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy={accuracy:.4f}")

X_train.shape=(127774, 4), y_train.shape=(127774,)
X_test.shape=(42834, 4), y_test.shape=(42834,)
Accuracy=0.6287
