# Stock Sentiment Analysis using News Headlines

The data is from Kaggle, and the task is to predict whether the stock price will increase or decrease based on news headlines. The Date column holds the date, and the Label column is 1 or 0: 1 means the stock price will increase, 0 means it will stay the same or decrease. All the other columns contain the top news headlines of the day.

In [2]:
import pandas as pd

ModuleNotFoundError: No module named 'pandas'

In [None]:
# relative path of the CSV that is loaded into `df`
data_path = "data/data.csv"

In [None]:
# load the dataset; the file is decoded as ISO-8859-1 (Latin-1) instead of
# pandas' default UTF-8 (presumably the CSV contains non-UTF-8 bytes — confirm)
df = pd.read_csv(data_path, encoding="ISO-8859-1")

In [None]:
# preview the first rows to check the column layout
df.head()

In [None]:
# (rows, columns) of the full dataset
df.shape

In [None]:
# Chronological split: everything before 2015 is training data, everything
# from 2015-01-01 onwards is held out for testing.
#
# The dates are parsed instead of being compared as raw strings. Comparing
# text against '20150101' / '20141231' is only correct when 'Date' is stored
# as compact YYYYMMDD text; with ISO 'YYYY-MM-DD' text the '-' character sorts
# before every digit, so all 2015 rows would satisfy BOTH conditions and leak
# from the test set into the training set.
dates = pd.to_datetime(df['Date'])
cutoff = pd.Timestamp('2015-01-01')
train = df[dates < cutoff]
test = df[dates >= cutoff]

In [None]:
# report (rows, columns) of the training split, then of the test split
for split in (train, test):
    print(split.shape)

In [None]:
# Take the headline columns (positions 2..26 of `train`) and replace every
# character that is not an ASCII letter with a space (removes punctuation
# and digits).
# The non-in-place `replace` returns a fresh DataFrame, so neither this step
# nor later column assignments on `data` operate on a slice of `train`
# (in-place edits of such a slice can raise SettingWithCopyWarning).
data = train.iloc[:, 2:27].replace(r"[^a-zA-Z]", " ", regex=True)

# rename the headline columns to "0" .. "24" for easy access
new_cols = [str(i) for i in range(25)]
data.columns = new_cols

In [None]:
# preview the first 5 rows of the cleaned headline columns
data.head(5)

In [None]:
# lowercase every headline column in a single column-wise pass
data[new_cols] = data[new_cols].apply(lambda column: column.str.lower())

In [None]:
# check that the headlines are now lowercase
data.head()

In [None]:
# merge the 25 headlines of each row into one space-separated document;
# str() is applied to every cell so non-string values (e.g. NaN) can be joined
headlines = [
    ' '.join(str(cell) for cell in data.iloc[position, 0:25])
    for position in range(len(data.index))
]

In [None]:
# inspect the combined document built from the first training row
headlines[0]

### Building the model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Let's write some utility functions

In [None]:
def build_model(X_train, y_train, X_test, y_test):
    """Fit three baseline classifiers and print their test-set metrics.

    A random forest, a logistic regression and a multinomial naive Bayes
    model are each trained on ``(X_train, y_train)`` and evaluated on
    ``(X_test, y_test)``. For every model the estimator, accuracy,
    confusion matrix and classification report are printed, followed by a
    separator line. Nothing is returned.
    """
    classifiers = {
        "rf": RandomForestClassifier(n_jobs=-1),
        "lr": LogisticRegression(),
        "mnb": MultinomialNB(),
    }
    separator = "=" * 100

    for estimator in classifiers.values():
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)

        print(f"Model: {estimator}")
        # same three reports, in the same order, for every model
        for metric in (accuracy_score, confusion_matrix, classification_report):
            print(metric(y_test, predicted))

        print(separator)


    

### Bag of words
Let's start with a simple bag-of-words model.



In [None]:
# bag-of-words counts; ngram_range=(2, 2) means the features are
# two-word sequences (bigrams) only, not single words
countvector = CountVectorizer(ngram_range=(2, 2))
X_train = countvector.fit_transform(raw_documents=headlines)

In [None]:
y_train = train['Label']

# Build one document per test row from the headline columns (positions 2..26).
# The test headlines get the SAME cleaning as the training headlines
# (non-letters -> space, then lowercase). Without it, digits survive in the
# test text and form bigrams that can never occur in the training
# vocabulary, so train and test features are not comparable.
cleaned_test = test.iloc[:, 2:27].replace(r"[^a-zA-Z]", " ", regex=True)
test_transform = [
    ' '.join(str(cell) for cell in row).lower()
    for row in cleaned_test.itertuples(index=False, name=None)
]

# reuse the vocabulary learned on the training documents
X_test = countvector.transform(test_transform)
y_test = test['Label']

In [None]:
# fit and evaluate all baseline models on the bigram count features
build_model(X_train, y_train, X_test, y_test)

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vectorizer's default settings.
# The matrix is kept sparse: the test matrix produced by `tfidf.transform`
# is sparse too, and `build_model` was already fitted on a sparse matrix in
# the bag-of-words run, so densifying with `.toarray()` only costs memory.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(headlines)

In [None]:
# Build one document per test row from the headline columns (positions 2..26),
# applying the same cleaning as the training headlines (non-letters -> space,
# then lowercase) so that train and test text are tokenised consistently.
cleaned_test = test.iloc[:, 2:27].replace(r"[^a-zA-Z]", " ", regex=True)
test_transform = [
    ' '.join(str(cell) for cell in row).lower()
    for row in cleaned_test.itertuples(index=False, name=None)
]

# reuse the vocabulary and IDF weights learned on the training documents
X_test = tfidf.transform(test_transform)

In [None]:
# same evaluation on the TF-IDF features; y_train and y_test are reused
# from the bag-of-words cells
build_model(X_train, y_train, X_test, y_test)