In [1]:
import sys
sys.path.append('../Storage')

import pgConn
import PostgresSQL_table_queries
import remoteStorage

import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Open a connection bound to the "financial_news" table and fetch its rows.
pg_conn = pgConn.PgConn("financial_news")
# NOTE(review): presumably a pandas DataFrame with a 'content' text column,
# since TextAnalysis below reads df['content'].values — confirm against pgConn.
df = pg_conn.get_financial_news()

Connection to the database successful!
Table name set to: financial_news


In [7]:
class TextAnalysis:
    """Term-frequency, document-frequency and TF-IDF statistics for a corpus.

    The corpus is the ``'content'`` column of ``dataframe``; every row is
    treated as one document. Term-frequency and TF-IDF results are kept in
    separate attributes so one computation never overwrites or gets
    mislabelled as the other.
    """

    def __init__(self, dataframe):
        """Store the corpus; nothing is computed until a method is called.

        Args:
            dataframe: Object exposing ``dataframe['content'].values`` (e.g. a
                pandas DataFrame), or None to create an empty analysis.
        """
        self.dataframe = dataframe
        # Vectorizer fitted by term_freq() (use_idf=False); None until then.
        self.vectorizer = None
        # Matrix produced by term_freq(); None until then.
        self.tf_matrix = None
        # Vectorizer fitted by tf_Idf() (use_idf=True); None until then.
        self.tfidf_vectorizer = None
        # Matrix produced by tf_Idf(); None until then.
        self.tfidf_matrix = None

    def _documents(self):
        """Return the 'content' column as a NumPy unicode array."""
        # NOTE(review): astype('U') stringifies every cell, so missing values
        # presumably turn into literal 'None'/'nan' tokens — confirm the
        # column contains no nulls.
        return self.dataframe['content'].values.astype('U')

    def _fitted(self):
        """Return a matching (vectorizer, matrix) pair, preferring the TF one.

        Returns ``(None, None)`` when nothing has been computed yet.
        """
        if self.tf_matrix is not None:
            return self.vectorizer, self.tf_matrix
        if self.tfidf_matrix is not None:
            return self.tfidf_vectorizer, self.tfidf_matrix
        return None, None

    def term_freq(self):
        """Fit a ``TfidfVectorizer(use_idf=False)`` and return its matrix.

        Returns:
            The documents-by-terms sparse matrix, or None when no dataframe
            was supplied.
        """
        if self.dataframe is None:
            return None
        self.vectorizer = TfidfVectorizer(use_idf=False)
        self.tf_matrix = self.vectorizer.fit_transform(self._documents())
        return self.tf_matrix

    def doc_freq(self):
        """Count, for every term, the number of documents that contain it.

        Uses the term-frequency matrix when available and otherwise the
        TF-IDF matrix; only the non-zero pattern is used.

        Returns:
            A 1 x n_terms count matrix, or None when neither term_freq() nor
            tf_Idf() has produced a matrix yet.
        """
        _, matrix = self._fitted()
        if matrix is None:
            return None
        # A non-zero entry means the term occurs in that document.
        return matrix.astype(bool).sum(axis=0)

    def tf_Idf(self):
        """Fit a ``TfidfVectorizer(use_idf=True)`` and return its matrix.

        Unlike earlier versions this does not require term_freq() to have
        been called first; it only needs a dataframe.

        Returns:
            The documents-by-terms sparse TF-IDF matrix, or None when no
            dataframe was supplied.
        """
        if self.dataframe is None:
            return None
        # Keep the fitted vectorizer so printed term names are guaranteed to
        # line up with the columns of this matrix.
        self.tfidf_vectorizer = TfidfVectorizer(use_idf=True)
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self._documents())
        return self.tfidf_matrix

    def print_term_freq(self):
        """Print every vocabulary term found by term_freq().

        Only the term names are listed; frequency values are not printed.
        """
        if self.vectorizer is None:
            print("No term frequency calculated.")
            return
        for term in self.vectorizer.get_feature_names_out():
            print(f"Term: {term}")

    def print_doc_freq(self):
        """Print each vocabulary term with its document frequency."""
        vectorizer, _ = self._fitted()
        doc_freq = self.doc_freq()
        if vectorizer is None or doc_freq is None:
            print("No document frequency calculated.")
            return
        terms = vectorizer.get_feature_names_out()
        for i, term in enumerate(terms):
            print(f"Term: {term}, Document Frequency: {doc_freq[0, i]}")

    def print_tf_Idf(self, doc_index=0):
        """Print the TF-IDF weight of every term for one document.

        Args:
            doc_index: Row of the TF-IDF matrix to print. Defaults to 0, the
                first document, which is what earlier versions always printed.

        Raises:
            IndexError: If ``doc_index`` is outside the matrix rows.
        """
        if self.tfidf_matrix is None or self.tfidf_vectorizer is None:
            print("No TF-IDF calculated.")
            return
        terms = self.tfidf_vectorizer.get_feature_names_out()
        # Densify the single row once instead of one sparse lookup per term.
        weights = self.tfidf_matrix[doc_index].toarray().ravel()
        for term, weight in zip(terms, weights):
            print(f"Term: {term}, TF-IDF: {weight}")

In [8]:
# Build the analysis over the loaded news rows, then run the three
# computations in order: term frequencies, document frequencies, TF-IDF.
analysis = TextAnalysis(df)
term_freq_matrix = analysis.term_freq()
# Number of documents containing each term (one count per vocabulary entry).
doc_freq_vector = analysis.doc_freq()
tfidf_matrix = analysis.tf_Idf()

In [9]:
# Print the results: list every vocabulary term found by term_freq().
# Note that print_term_freq() prints the term names only, not frequency values.
analysis.print_term_freq()

Term: 00
Term: 000
Term: 000th
Term: 000x
Term: 01
Term: 02
Term: 03
Term: 06
Term: 07
Term: 070
Term: 075
Term: 08
Term: 10
Term: 100
Term: 100x
Term: 103
Term: 104
Term: 105
Term: 107
Term: 10x
Term: 11
Term: 110
Term: 115
Term: 12
Term: 122
Term: 125
Term: 13
Term: 13bn
Term: 14
Term: 141
Term: 147
Term: 15
Term: 150
Term: 16
Term: 160
Term: 164
Term: 17
Term: 170
Term: 176
Term: 18
Term: 182
Term: 19
Term: 190
Term: 1930
Term: 1946
Term: 195
Term: 196
Term: 1990
Term: 1m
Term: 20
Term: 200
Term: 2000
Term: 2002
Term: 2005
Term: 2008
Term: 2009
Term: 2011
Term: 2013
Term: 2014
Term: 2018
Term: 2019
Term: 2020
Term: 2021
Term: 2022
Term: 2023
Term: 2024
Term: 2025
Term: 2028
Term: 2030
Term: 21
Term: 210
Term: 214
Term: 2140
Term: 217
Term: 21shares
Term: 22
Term: 23
Term: 237
Term: 23m
Term: 24
Term: 244
Term: 25
Term: 250
Term: 26
Term: 27
Term: 28
Term: 280
Term: 29
Term: 291
Term: 30
Term: 300
Term: 305
Term: 31
Term: 33
Term: 332
Term: 34
Term: 345
Term: 350
Term: 355
Term: 36
T

In [10]:
# Print, for every vocabulary term, how many documents contain it.
analysis.print_doc_freq()

Term: 00, Document Frequency: 1
Term: 000, Document Frequency: 23
Term: 000th, Document Frequency: 1
Term: 000x, Document Frequency: 1
Term: 01, Document Frequency: 2
Term: 02, Document Frequency: 1
Term: 03, Document Frequency: 3
Term: 06, Document Frequency: 1
Term: 07, Document Frequency: 1
Term: 070, Document Frequency: 1
Term: 075, Document Frequency: 2
Term: 08, Document Frequency: 1
Term: 10, Document Frequency: 19
Term: 100, Document Frequency: 13
Term: 100x, Document Frequency: 1
Term: 103, Document Frequency: 1
Term: 104, Document Frequency: 1
Term: 105, Document Frequency: 1
Term: 107, Document Frequency: 1
Term: 10x, Document Frequency: 1
Term: 11, Document Frequency: 5
Term: 110, Document Frequency: 1
Term: 115, Document Frequency: 1
Term: 12, Document Frequency: 6
Term: 122, Document Frequency: 1
Term: 125, Document Frequency: 2
Term: 13, Document Frequency: 4
Term: 13bn, Document Frequency: 1
Term: 14, Document Frequency: 10
Term: 141, Document Frequency: 1
Term: 147, Do

In [11]:
# Print each term's weight for the first document only (row 0 of the matrix).
analysis.print_tf_Idf()

Term: 00, TF-IDF: 0.0
Term: 000, TF-IDF: 0.013850914949399547
Term: 000th, TF-IDF: 0.0
Term: 000x, TF-IDF: 0.0
Term: 01, TF-IDF: 0.0
Term: 02, TF-IDF: 0.0
Term: 03, TF-IDF: 0.0
Term: 06, TF-IDF: 0.0
Term: 07, TF-IDF: 0.0
Term: 070, TF-IDF: 0.0
Term: 075, TF-IDF: 0.0
Term: 08, TF-IDF: 0.0
Term: 10, TF-IDF: 0.0
Term: 100, TF-IDF: 0.01887653286292354
Term: 100x, TF-IDF: 0.0
Term: 103, TF-IDF: 0.0
Term: 104, TF-IDF: 0.0
Term: 105, TF-IDF: 0.0
Term: 107, TF-IDF: 0.0
Term: 10x, TF-IDF: 0.0
Term: 11, TF-IDF: 0.0
Term: 110, TF-IDF: 0.0
Term: 115, TF-IDF: 0.0
Term: 12, TF-IDF: 0.0
Term: 122, TF-IDF: 0.0
Term: 125, TF-IDF: 0.0
Term: 13, TF-IDF: 0.0
Term: 13bn, TF-IDF: 0.0
Term: 14, TF-IDF: 0.0
Term: 141, TF-IDF: 0.0
Term: 147, TF-IDF: 0.0
Term: 15, TF-IDF: 0.0
Term: 150, TF-IDF: 0.0
Term: 16, TF-IDF: 0.0
Term: 160, TF-IDF: 0.0
Term: 164, TF-IDF: 0.0
Term: 17, TF-IDF: 0.025339455873732354
Term: 170, TF-IDF: 0.0
Term: 176, TF-IDF: 0.0
Term: 18, TF-IDF: 0.0
Term: 182, TF-IDF: 0.0
Term: 19, TF-IDF: 