# Stock Sentiment Analysis 

### 1 - Stock Price increased
### 0 - Stock Price decreased or stayed same

In [1]:
! pip install nltk
! pip install pandas
! pip install -U scikit-learn



In [2]:
import nltk
import pandas as pd
import sklearn

In [3]:
dataframe = pd.read_csv('Dataset.csv',encoding = "ISO-8859-1")
dataframe.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite
2,2000-01-05,0,Coventry caught on counter by Flo,United's rivals on the road to Rio,Thatcher issues defence before trial by video,Police help Smith lay down the law at Everton,Tale of Trautmann bears two more retellings,England on the rack,Pakistan retaliate with call for video of Walsh,Cullinan continues his Cape monopoly,...,South Melbourne (Australia),Necaxa (Mexico),Real Madrid (Spain),Raja Casablanca (Morocco),Corinthians (Brazil),Tony's pet project,Al Nassr (Saudi Arabia),Ideal Holmes show,Pinochet leaves hospital after tests,Useful links
3,2000-01-06,1,Pilgrim knows how to progress,Thatcher facing ban,McIlroy calls for Irish fighting spirit,Leicester bin stadium blueprint,United braced for Mexican wave,"Auntie back in fashion, even if the dress look...",Shoaib appeal goes to the top,Hussain hurt by 'shambles' but lays blame on e...,...,Putin admits Yeltsin quit to give him a head s...,BBC worst hit as digital TV begins to bite,How much can you pay for...,Christmas glitches,"Upending a table, Chopping a line and Scoring ...","Scientific evidence 'unreliable', defence claims",Fusco wins judicial review in extradition case,Rebels thwart Russian advance,Blair orders shake-up of failing NHS,Lessons of law's hard heart
4,2000-01-07,1,Hitches and Horlocks,Beckham off but United survive,Breast cancer screening,Alan Parker,Guardian readers: are you all whingers?,Hollywood Beyond,Ashes and diamonds,Whingers - a formidable minority,...,Most everywhere: UDIs,Most wanted: Chloe lunettes,Return of the cane 'completely off the agenda',From Sleepy Hollow to Greeneland,Blunkett outlines vision for over 11s,"Embattled Dobson attacks 'play now, pay later'...",Doom and the Dome,What is the north-south divide?,Aitken released from jail,Gone aloft


In [4]:
train_data = dataframe[dataframe['Date'] < '20150101'] 
test_data = dataframe[dataframe['Date'] > '20141231'] 

In [5]:
# Data Preprocessing & Cleaning

# Removing Punctuations 

train_data_cleaned = train_data.iloc[:,2:27]
train_data_cleaned.replace("[^a-zA-Z]", " " ,regex=True,inplace=True)

# Renaming Columns

train_data_cleaned.columns = [str(i) for i in range(25)]


In [6]:
# Converting into lowercase

for i in range(25):
    train_data_cleaned[str(i)] = train_data_cleaned[str(i)].str.lower()
train_data_cleaned.head(1)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,a hindrance to operations extracts from the...,scorecard,hughes instant hit buoys blues,jack gets his skates on at ice cold alex,chaos as maracana builds up for united,depleted leicester prevail as elliott spoils e...,hungry spurs sense rich pickings,gunners so wide of an easy target,derby raise a glass to strupar s debut double,southgate strikes leeds pay the penalty,...,flintoff injury piles on woe for england,hunters threaten jospin with new battle of the...,kohl s successor drawn into scandal,the difference between men and women,sara denver nurse turned solicitor,diana s landmine crusade put tories in a panic,yeltsin s resignation caught opposition flat f...,russian roulette,sold out,recovering a title


In [7]:
# Converting each index into a single sentence & appending it into a list then applying Bag of Words Model

headlines=[]
for row in range(0,len(train_data_cleaned.index)):
    headlines.append(' '.join(str(x) for x in train_data_cleaned.iloc[row,0:25]))
    

In [8]:
# Bag of Words Model

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

cv = CountVectorizer(ngram_range = (2,2))
train_data_modeled = cv.fit_transform(headlines)


In [9]:
#  Random Forest Classifier

rc = RandomForestClassifier(n_estimators=200,criterion='entropy')
rc.fit(train_data_modeled,train_data["Label"])


In [10]:
# Predicting for test dataset

test_transform=[]
for row in range(0,len(test_data.index)):
    test_transform.append(' '.join(str(x) for x in test_data.iloc[row,2:27]))
    
test_data_modeled = cv.transform(test_transform)
prediction = rc.predict(test_data_modeled)



In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Load dataset
file_path = 'Dataset.csv' 
dataframe = pd.read_csv(file_path, encoding='ISO-8859-1')

# Filter for Apple-related news
apple_related = dataframe.iloc[:, 2:27].apply(lambda x: x.str.contains('Apple|AAPL', case=False, na=False))
apple_data = dataframe[apple_related.any(axis=1)]

# Split data into train and test sets
train_data = apple_data[apple_data['Date'] < '20150101']
test_data = apple_data[apple_data['Date'] > '20141231']

# Data Preprocessing
def preprocess_data(data):
    data_cleaned = data.iloc[:, 2:27]
    data_cleaned.replace("[^a-zA-Z]", " ", regex=True, inplace=True)
    data_cleaned.columns = [str(i) for i in range(25)]

    for i in range(25):
        data_cleaned[str(i)] = data_cleaned[str(i)].str.lower()

    headlines = []
    for row in range(0, len(data_cleaned.index)):
        headlines.append(' '.join(str(x) for x in data_cleaned.iloc[row, 0:25]))
    
    return headlines

train_headlines = preprocess_data(train_data)
test_headlines = preprocess_data(test_data)

# Vectorization
cv = CountVectorizer(ngram_range=(2, 2))
train_data_modeled = cv.fit_transform(train_headlines)

# Train the Random Forest Classifier
rc = RandomForestClassifier(n_estimators=200, criterion='entropy')
rc.fit(train_data_modeled, train_data["Label"])

# Prepare Test Data
test_data_modeled = cv.transform(test_headlines)

# Prediction
predictions = rc.predict(test_data_modeled)

# Output Predictions with Dates
test_dates = test_data['Date'].reset_index(drop=True)
predicted_movements = pd.DataFrame({
    'Date': test_dates,
    'Prediction': predictions
})
predicted_movements['Predicted Movement'] = predicted_movements['Prediction'].apply(lambda x: 'Increase' if x == 1 else 'Decrease/Stagnant')

print(predicted_movements[['Date', 'Predicted Movement']])


          Date Predicted Movement
0   2015-02-26  Decrease/Stagnant
1   2015-05-27           Increase
2   2015-07-21  Decrease/Stagnant
3   2015-07-28           Increase
4   2015-09-21           Increase
5   2015-09-29           Increase
6   2015-10-15           Increase
7   2015-11-10           Increase
8   2015-11-20           Increase
9   2015-12-22           Increase
10  2015-12-30  Decrease/Stagnant
11  2016-01-14           Increase
12  2016-02-01           Increase
13  2016-03-09           Increase
14  2016-03-30           Increase
15  2016-05-04           Increase


In [12]:
# Importing sklearn libraries for checking accuracy, confusion matrix, classification report

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score


In [14]:
# Classification Report

Classification_report = classification_report(test_data['Label'],prediction)
print(Classification_report)

# Confusion Matrix

Confusion_matrix = confusion_matrix(test_data['Label'],prediction)
print(Confusion_matrix)

# Accuracy Score

Accuracy = accuracy_score(test_data['Label'],prediction)
print(Accuracy)


ValueError: Found input variables with inconsistent numbers of samples: [16, 378]