In [56]:
# Mount Google Drive at /content/drive so the project resources used by the
# cells below (all rooted under '/content/drive/My Drive/...') are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
# Sanity check: list the project folder on the mounted drive.
!ls "/content/drive/My Drive/IR - Project"

Resources


In [103]:
# Define the project root and every input/output location used by the notebook.
# NOTE: despite the `_dir` suffix, only `given_data_dir` and `clean_dataset_dir`
# are folders; all the other names below hold paths to individual files.

# Project root on the mounted Google Drive.
root = '/content/drive/My Drive/IR - Project'

# Inputs: the raw dataset folder and the two term lists (files).
given_data_dir = root + '/Resources/Given-Dataset'
given_emotional_expresions_dir = root + '/Resources/Emotional-Expresions/anger.wne'
given_trust_words_dir = root + '/Resources/Emotional-Expresions/trust_terms.txt'

# Outputs: the folder of cleaned files and the JSON indexes/summary written later.
clean_dataset_dir = root + '/Resources/My-Dataset/Clean-DB'
emotional_expresions_index_dir = root + '/Resources/My-Dataset/Inverted-Indexes/anger_index.json'
trust_terms_index_dir = root + '/Resources/My-Dataset/Inverted-Indexes/trust_terms_index.json'
sentences_documents_index_dir = root + '/Resources/My-Dataset/Inverted-Indexes/sentences_documents_index.json'
trust_emotional_pairs_index_dir = root + '/Resources/My-Dataset/Inverted-Indexes/trust_emotional_pairs_index.json'
summary_dir = root + '/Resources/Summary.json'

In [104]:
# Necessary Imports
import os
import re
import json

In [82]:
# Read the files in the given dataset, clean it from any dirty symbols and characters.

# A function to collect the paths of all files found under the given directory
def load_all_file_pathes( folder_dir: str ) -> set:
  """Recursively collect the path of every file located under `folder_dir`.

  Each path is the directory reported by os.walk joined with the file name.
  Returns an empty set when the walk yields no files.
  """
  return {
      os.path.join( current_dir, file_name )
      for current_dir, _subdirs, file_names in os.walk( folder_dir )
      for file_name in file_names
  }

# A function to clean the given text 
def clean_file( text: str ) -> str:
  """Strip markup and unwanted characters from `text`.

  Steps, in order:
    1. drop anything that looks like an html tag (`<...>` on a single line),
    2. keep only letters, digits and a fixed list of punctuation/whitespace,
    3. collapse every run of spaces into a single space.
  """
  without_tags = re.sub( '<.*?>', '', text )

  # characters that survive the filter besides letters and digits
  kept_symbols = ('!', ',', '?', '$', '*', '=', '-', '/', '+', ':', ';', '.', '\n', ' ')
  filtered = ''.join(
      ch for ch in without_tags
      if ch.isalpha() or ch.isdigit() or ch in kept_symbols
  )

  return re.sub( ' +', ' ', filtered )


# a function to read the given file
def read_file( file_path: str ) -> str:
  """Return the whole content of the text file at `file_path`.

  The file is opened in text mode without an explicit encoding.
  """
  with open( file_path, mode='r' ) as source:
    content = source.read()
  return content


# a function to write a file as text file
def write_file( file_path: str, text: str ) -> None:
  """Store `text` in the file at `file_path`, replacing any previous content."""
  with open( file_path, mode='w' ) as target:
    target.write( text )


# a function to read every file in the given list, clean its text and
# write the cleaned copy into the clean dataset directory
def make_clean_dataset( files_pathes: list ):
  """Clean every file listed in `files_pathes` and save it under `clean_dataset_dir`.

  For each input path the file is read, passed through `clean_file`, and
  written to `clean_dataset_dir` under the same base name. Progress is
  printed once per file.

  Note: the output is flat, so two inputs sharing a base name overwrite
  each other.
  """
  # Create the output folder up front so a fresh run does not fail on the first write.
  os.makedirs( clean_dataset_dir, exist_ok=True )

  all_files = len( files_pathes )
  for file_number, file_path in enumerate( files_pathes, start=1 ):
    print( f'------------------------ File Number { file_number } From { all_files } Files ------------------------' )
    text = clean_file( read_file( file_path ) )
    # basename/join instead of slicing on '/', which misbehaves for paths without a slash
    new_file_path = os.path.join( clean_dataset_dir, os.path.basename( file_path ) )
    write_file( new_file_path, text )


# Run the cleaning step: collect every file of the given dataset and write
# its cleaned copy. The function that does the work is `make_clean_dataset`.
files_pathes = load_all_file_pathes( given_data_dir )
make_clean_dataset( list(files_pathes) )

------------------------ File Number 1 From 773 Files ------------------------
------------------------ File Number 2 From 773 Files ------------------------
------------------------ File Number 3 From 773 Files ------------------------
------------------------ File Number 4 From 773 Files ------------------------
------------------------ File Number 5 From 773 Files ------------------------
------------------------ File Number 6 From 773 Files ------------------------
------------------------ File Number 7 From 773 Files ------------------------
------------------------ File Number 8 From 773 Files ------------------------
------------------------ File Number 9 From 773 Files ------------------------
------------------------ File Number 10 From 773 Files ------------------------
------------------------ File Number 11 From 773 Files ------------------------
------------------------ File Number 12 From 773 Files ------------------------
------------------------ File Number 13 From 773 

In [87]:
# make terms-sentences index

def load_emotional_words() -> (dict, dict):
  """Build two empty inverted indexes from the term lists on disk.

  Emotional terms are read from `given_emotional_expresions_dir`: newlines are
  removed and the text is split on single spaces. Trust terms are read from
  `given_trust_words_dir`, one per line. Empty entries are dropped, duplicates
  removed, and every term is casefolded before becoming a key mapped to its
  own empty list.

  NOTE(review): because newlines are removed (not replaced by a space), two
  emotional terms separated only by a line break end up joined into one key.
  Confirm this matches the layout of the .wne file.

  Returns:
    (trust_words_index, emo_words_index)
  """
  emotional_text = read_file( given_emotional_expresions_dir )
  emotional_terms = { term for term in emotional_text.replace('\n','').split(' ') if len(term) > 0 }
  # a fresh list per key: the lists are filled independently later on
  emo_words_index = { term.casefold(): [ ] for term in emotional_terms }

  trust_text = read_file( given_trust_words_dir )
  trust_terms = { term for term in trust_text.split('\n') if len(term) > 0 }
  trust_words_index = { term.casefold(): [ ] for term in trust_terms }

  return trust_words_index, emo_words_index
  

def make_json( file_path: str, data: dict ) -> None:
    """Serialize `data` as JSON into `file_path` (overwriting it), indented by 6 spaces."""
    with open( file_path, mode='w' ) as target:
        json.dump( data, target, indent=6 )


def clean_sentence( sentence: str ) -> list:
  """Tokenise `sentence`: keep only letters, digits and spaces, then split on single spaces.

  Consecutive spaces therefore produce empty-string tokens, and an empty
  sentence yields `['']`.
  """
  kept_chars = [ ch for ch in sentence if ch == ' ' or ch.isalpha() or ch.isdigit() ]
  return ''.join( kept_chars ).split(' ')
   

def make_terms_sentences_index():
  """Build and save the sentence→document, trust-term and emotional-term indexes.

  Every file of the clean dataset is read, its newlines removed and its text
  split into sentences on '.'. Then:
    * each sentence is mapped to the base name of the file it came from
      (a sentence seen in several files keeps the last file processed),
    * each trust / emotional term gets the list of distinct sentences in
      which it appears as a whole token (see `clean_sentence`).
  The three indexes are written as JSON to `sentences_documents_index_dir`,
  `trust_terms_index_dir` and `emotional_expresions_index_dir`.

  NOTE(review): index terms are casefolded by `load_emotional_words`, while
  sentence tokens keep their original case, so capitalised occurrences
  (e.g. at the start of a sentence) are not matched. Confirm this is intended.
  """
  trust_words_index, emo_words_index = load_emotional_words( )
  files_pathes = load_all_file_pathes( clean_dataset_dir )
  sentences_documents_indexes = { }
  all_files = len( files_pathes )

  for file_number, file_path in enumerate( files_pathes, start=1 ):
    print( f'------------------------ File Number { file_number } From { all_files } Files ------------------------' )
    file_name = os.path.basename( file_path )
    sentences = read_file( file_path ).replace('\n','').split(".")

    for sentence in sentences:
      sentences_documents_indexes[sentence] = file_name

      # Tokenise once per sentence instead of once per (term, sentence) pair;
      # a set makes the term lookup constant time.
      tokens = set( clean_sentence( sentence ) )

      for terms_index in ( trust_words_index, emo_words_index ):
        # only the terms actually present in this sentence are visited
        for term in tokens.intersection( terms_index ):
          matches = terms_index[term]
          if sentence not in matches:
            matches.append( sentence )

  make_json( sentences_documents_index_dir, sentences_documents_indexes )
  make_json( trust_terms_index_dir, trust_words_index )
  make_json( emotional_expresions_index_dir, emo_words_index )

    

# Build the inverted indexes from the clean dataset and write them to disk as JSON.
make_terms_sentences_index()

------------------------ File Number 1 From 773 Files ------------------------
------------------------ File Number 2 From 773 Files ------------------------
------------------------ File Number 3 From 773 Files ------------------------
------------------------ File Number 4 From 773 Files ------------------------
------------------------ File Number 5 From 773 Files ------------------------
------------------------ File Number 6 From 773 Files ------------------------
------------------------ File Number 7 From 773 Files ------------------------
------------------------ File Number 8 From 773 Files ------------------------
------------------------ File Number 9 From 773 Files ------------------------
------------------------ File Number 10 From 773 Files ------------------------
------------------------ File Number 11 From 773 Files ------------------------
------------------------ File Number 12 From 773 Files ------------------------
------------------------ File Number 13 From 773 

In [105]:
# make summary and statistics

def load_json_file( file_path ):
    """Parse the JSON document stored at `file_path` and return the resulting object."""
    with open( file_path ) as source:
        parsed = json.load( source )
    return parsed


def make_trust_words_summary( trust_words_index: dict ) -> dict:
  """Map every trust term to the number of sentences recorded for it in the index."""
  print( f'------------------------ Make Trust-Words Summary ------------------------' )
  return { term: len( matches ) for term, matches in trust_words_index.items() }


def make_emotional_words_summary( emo_words_index: dict ) -> dict:
  """Map every emotional term to the number of sentences recorded for it in the index."""
  print( f'------------------------ Make Emotional-Words Summary ------------------------' )
  return { term: len( matches ) for term, matches in emo_words_index.items() }


def make_trust_emotional_pairs_index( trust_words_index: dict, 
            emo_words_index: dict, sentences_documents_index: dict ) -> dict:
  """Index the sentences in which a trust term and an emotional term co-occur.

  Args:
    trust_words_index: index whose keys are the trust terms.
    emo_words_index: index whose keys are the emotional terms.
    sentences_documents_index: index whose keys are the sentences to scan.

  Returns:
    A dict mapping '<trust term>-<emotional term>' to the list of distinct
    sentences containing both terms as whole tokens (see `clean_sentence`).
    The same dict is also written as JSON to `trust_emotional_pairs_index_dir`.
  """
  pairs_index = { }
  print( f'------------------------ Make Trust-Emotional Pairs Summary ------------------------' )
  all_sentences_size = len( sentences_documents_index.keys() )
  for iteration_number, sentence in enumerate( sentences_documents_index, start=1 ):
    print( f'------------------------ Iteration Number { iteration_number } From { all_sentences_size } Iterations ------------------------' )

    # Work out once which terms occur in this sentence, rather than testing
    # every (emotional, trust) combination against a token list.
    tokens = set( clean_sentence(sentence) )
    present_trust = [ tw for tw in trust_words_index if tw in tokens ]
    if not present_trust:
      continue
    present_emotional = [ ew for ew in emo_words_index if ew in tokens ]

    # Same nesting order as before (emotional outer, trust inner) so the keys
    # are created in the same order.
    for ew in present_emotional:
      for tw in present_trust:
        pair_sentences = pairs_index.setdefault( f'{tw}-{ew}', [] )
        if sentence not in pair_sentences:
          pair_sentences.append( sentence )

  make_json(trust_emotional_pairs_index_dir, pairs_index)
  return pairs_index


def make_trust_emotional_pairs_summary( trust_words_index: dict, 
            emo_words_index: dict, sentences_documents_index: dict ) -> dict:
  """Count, for every trust/emotional pair, the sentences in which both terms occur.

  The pairs index is built by `make_trust_emotional_pairs_index` from the
  three given indexes; the result maps each pair key to the length of its
  sentence list.
  """
  pairs_index = make_trust_emotional_pairs_index( trust_words_index,  emo_words_index,
                                    sentences_documents_index )
  return { pair: len( pair_sentences ) for pair, pair_sentences in pairs_index.items() }


def make_summary():
  """Compute the statistics of the previously saved indexes and write them to `summary_dir`.

  The summary contains the per-term sentence counts for trust and emotional
  terms, the per-pair co-occurrence counts, the number of indexed sentences
  and the number of files in the clean dataset.
  """
  trust_words_index = load_json_file( trust_terms_index_dir )
  emo_words_index = load_json_file( emotional_expresions_index_dir )
  sentences_documents_index = load_json_file( sentences_documents_index_dir )

  summary = {
      "Trust-Words": make_trust_words_summary( trust_words_index ),
      "Emotional-Words": make_emotional_words_summary( emo_words_index ),
      "Trust-Emotional-Pairs": make_trust_emotional_pairs_summary(
                       trust_words_index, emo_words_index,
                       sentences_documents_index ),
      "Sentences-Numbers": len( sentences_documents_index ),
      # Count the files that were actually indexed instead of hard-coding the
      # size of one particular dataset.
      "Files-Numbers": len( load_all_file_pathes( clean_dataset_dir ) ),
  }

  make_json( summary_dir, summary )


# Load the saved indexes, compute the statistics and write the summary JSON.
make_summary()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
------------------------ Iteration Number 19175 From 24174 Iterations ------------------------
------------------------ Iteration Number 19176 From 24174 Iterations ------------------------
------------------------ Iteration Number 19177 From 24174 Iterations ------------------------
------------------------ Iteration Number 19178 From 24174 Iterations ------------------------
------------------------ Iteration Number 19179 From 24174 Iterations ------------------------
------------------------ Iteration Number 19180 From 24174 Iterations ------------------------
------------------------ Iteration Number 19181 From 24174 Iterations ------------------------
------------------------ Iteration Number 19182 From 24174 Iterations ------------------------
------------------------ Iteration Number 19183 From 24174 Iterations ------------------------
------------------------ Iteration Number 19184 From 24174 Iterations ----------