In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.svm import SVC
import re

In [12]:
def cleaner(impure_data):
    """Strip @-mentions, http(s) links and ASCII punctuation from each text.

    Parameters
    ----------
    impure_data : iterable of str
        Raw texts, e.g. a list or a pandas Series of headlines.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order. Whitespace is not
        normalised, so a removed token can leave leading or doubled spaces.
        Non-ASCII characters such as emoji are left untouched.
    """
    # Raw strings are required here: in an ordinary literal, backslash-S and
    # backslash-] are invalid escape sequences (SyntaxWarning on Python 3.12+).
    # The patterns are compiled once, outside the loop, because they do not
    # depend on the item being cleaned.
    mention_re = re.compile(r'@\S+')
    link_re = re.compile(r'http\S+\s*')
    punctuation_re = re.compile(
        '[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""")
    )

    temp_list = []
    for item in impure_data:
        # Order matters: mentions and links must go before punctuation is
        # stripped, otherwise '@', ':' and '/' would already be gone and the
        # first two patterns could no longer recognise their tokens.
        item = mention_re.sub('', item)
        item = link_re.sub('', item)
        item = punctuation_re.sub('', item)
        temp_list.append(item)
    return temp_list

In [18]:
def news_sentiment(data_path="C://Users//kkesh//Desktop//financialNews.csv",
                   output_path="predicted_financialNews.csv",
                   test_size=0.3,
                   random_state=42):
    """Train a linear-kernel SVM on TF-IDF features of news headlines.

    Reads a CSV with ``Title`` and ``Decisions`` columns, keeps only the rows
    whose ``Decisions`` string contains no comma (i.e. a single decision),
    extracts the sentiment label from it, trains the classifier on a random
    train/test split, prints train and test accuracy, and writes the test
    predictions to ``output_path``.

    Parameters
    ----------
    data_path : str, optional
        CSV file to read. Defaults to the path originally hard-coded here.
    output_path : str, optional
        Where the test-set predictions are written as CSV.
    test_size : float, optional
        Fraction of the data held out for testing (default 0.3).
    random_state : int or None, optional
        Seed for the train/test split so that the reported accuracies are
        reproducible. Pass ``None`` for a different random split on each call.

    Returns
    -------
    tuple
        ``(fitted TfidfVectorizer, fitted SVC)``.

    Raises
    ------
    ValueError
        If no usable single-decision rows remain after filtering.
    """
    # reading the news from the csv file
    df = pd.read_csv(data_path)

    # Rows with a missing headline or decision cannot be cleaned or parsed
    # (they would fail on str methods further down), so drop them up front.
    df = df.dropna(subset=["Title", "Decisions"])

    # Keep only rows whose Decisions string has no comma, i.e. rows that
    # carry exactly one decision. A boolean mask avoids the quadratic
    # "index not in list" filtering and does not depend on index labels.
    single_decision = [',' not in decision for decision in df["Decisions"]]
    labelled = df.loc[single_decision]
    if labelled.empty:
        raise ValueError("no single-decision rows found in %r" % (data_path,))

    # The label is the text between the first and second ':' with two
    # characters stripped from each end.
    # NOTE(review): presumably Decisions looks like {"entity": "label"} so this
    # strips the space/quote and quote/brace -- confirm against the data file.
    polarity = [decision.split(':')[1][1:-1][1:-1]
                for decision in labelled["Decisions"]]
    news = labelled["Title"]
    print(len(polarity))

    # cleaning news i.e. removing @-mentions, http(s) links and punctuation
    clean_news = cleaner(news)

    # initializing tf-idf vectorizer
    tf_idfvectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)

    # splitting the data into train and test sets (70 : 30 by default);
    # random_state makes the split repeatable
    X_train, X_test, Y_train, Y_test = train_test_split(
        clean_news, polarity, test_size=test_size, random_state=random_state
    )

    # fit_transform() learns the vocabulary and idf weights from the training
    # texts and returns their tf-idf matrix
    train_corpus_tf_idf = tf_idfvectorizer.fit_transform(X_train)

    # transform() reuses the vocabulary and idf weights learned above, so the
    # test texts are only encoded, never learned from
    test_corpus_tf_idf = tf_idfvectorizer.transform(X_test)

    # classifier with a linear kernel, C=1 and other default parameters
    SVM_L = SVC(kernel='linear', C=1)

    # fitting the classifier on the training matrix and its sentiments
    SVM_L.fit(train_corpus_tf_idf, Y_train)

    # predicting the sentiments for the test and the training data
    Y_pred = SVM_L.predict(test_corpus_tf_idf)
    Y_train_pred = SVM_L.predict(train_corpus_tf_idf)

    # accuracy on the training data and on the held-out test data
    print("Train Accuracy", accuracy_score(Y_train, Y_train_pred))
    print("Test Accuracy", accuracy_score(Y_test, Y_pred))

    # saving the test headlines with true and predicted sentiment to a csv file
    temp_df = pd.DataFrame({
        "News": X_test,
        "Sentiment": Y_test,
        "Predicted Sentiment": Y_pred,
    })
    temp_df.to_csv(output_path)

    return (tf_idfvectorizer, SVM_L)

In [19]:
# Train once and keep the fitted vectorizer and classifier for the cells below.
vectorizer, model = news_sentiment()

7897
Train Accuracy 0.9509679753935227
Test Accuracy 0.8021097046413502


In [20]:
# Classify one unseen headline with the fitted TF-IDF vectorizer and SVM.
headline = "Rebound for Russia and China lifts stocks"
vector = vectorizer.transform([headline])
sentiment = model.predict(vector)
print(sentiment)

['positive']


In [21]:
# Classify one unseen headline with the fitted TF-IDF vectorizer and SVM.
headline = "Sebi seeks clarification on RINL IPO"
vector = vectorizer.transform([headline])
sentiment = model.predict(vector)
print(sentiment)

['neutral']


In [25]:
# Classify one unseen headline with the fitted TF-IDF vectorizer and SVM.
headline = "At Wipro, growth remains at low"
vector = vectorizer.transform([headline])
sentiment = model.predict(vector)
print(sentiment)

['negative']
