In [None]:
import nltk
from nltk import FreqDist
from collections import defaultdict
from nltk.util import ngrams
import string
import re
from nltk.corpus import stopwords
import csv
from nltk.tokenize import MWETokenizer
from nltk.chunk import RegexpParser
from nltk.parse import DependencyGraph

# Download the NLTK data packages used by this script, in a fixed order.
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'dependency_treebank'):
    nltk.download(resource)


# Stop-word set: NLTK's English list, plus every single digit and every
# single lowercase ASCII letter.
stop_words = set(stopwords.words('english'))
stop_words |= set(string.digits)
stop_words |= set(string.ascii_lowercase)

# Extend the set with the entries of stoplist.txt (one entry per line, used
# exactly as written in the file).
with open('stoplist.txt', 'r') as f:
    stoplist_words = f.read().splitlines()
stop_words |= set(stoplist_words)

def pretagging(txt):
    """Normalize raw text with a fixed sequence of regex rewrites.

    The rules are applied in order:

    1. Drop commas that sit between two digits ("1,000" -> "1000").
    2. Replace a period between two word characters with the placeholder
       "<dot>" ("example.com" -> "example<dot>com").
    3. Insert a space between a digit and a directly following run of
       lowercase letters ("5kg" -> "5 kg").
    4. Collapse repeated '!', '?', '.' or '-' into a single character.
    5. Replace every run of non-ASCII characters with one space.
    6. Collapse every run of whitespace into one space.

    Args:
        txt: The text to normalize.

    Returns:
        The rewritten text. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    rules = [
        # Remove commas inside numbers. Lookarounds test the neighbouring
        # digits without consuming them, so every comma in "1,2,3" is
        # removed (a consuming pattern would skip every other one).
        [r'(?<=\d),(?=\d)', ''],
        # Replace periods inside web domains with <dot>. Same reasoning:
        # "u.s.a" must become "u<dot>s<dot>a", not "u<dot>s.a".
        [r'(?<=\w)\.(?=\w)', '<dot>'],
        # Insert white space in front of a unit where necessary
        [r'(\d)([a-z]+)', r'\1 \2'],
        # Compress repetitive punctuation into a single character
        [r'([!?.-])\1+', r'\1'],
        # Normalize non-ASCII characters. This runs before whitespace
        # normalization because it introduces new spaces.
        [r'[^\x00-\x7F]+', ' '],
        # Normalize whitespace (last, so no earlier rule can leave doubles)
        [r'\s+', ' '],
    ]
    # Apply all rules
    for pattern, replacement in rules:
        txt = re.sub(pattern, replacement, txt)
    return txt

def hyphen(txt):
    """Replace every hyphen that sits directly between two letters with a space.

    "Letter" means a character matched by ``[a-z]`` case-insensitively.
    The lookbehind/lookahead check the neighbours without consuming them,
    so chains such as "a-b-c" are fully split in a single pass. Hyphens
    next to digits, spaces, or the string boundary are left untouched.
    """
    return re.sub(r'(?<=[a-z])-(?=[a-z])', ' ', txt, flags=re.IGNORECASE)

def process_text(text):
    """Convert one piece of raw text into a list of filtered tokens.

    Steps: run ``pretagging`` and ``hyphen``, lower-case the result, blank
    out unwanted characters, tokenize with ``nltk.word_tokenize``, then
    drop unwanted tokens.

    Args:
        text: Raw input text (e.g. one line of the corpus).

    Returns:
        A list of tokens, in their original order, that are longer than one
        character, are not in ``stop_words``, and do not contain the
        substring "dot".
    """
    normalized = hyphen(pretagging(text))

    # Blank out everything except word characters, whitespace and the
    # characters of the "<dot>" placeholder (the class lists '<', 'd', 'o',
    # 't', '>' individually, so stray angle brackets survive as well).
    cleaned = re.sub(r'[^\w\s<dot>]', ' ', normalized.lower())

    kept = []
    for raw_token in nltk.word_tokenize(cleaned):
        # Turn any placeholder left inside a token back into a period.
        token = raw_token.replace('<dot>', '.')

        if len(token) <= 1:
            continue
        # Substring test: this also discards ordinary words that merely
        # contain "dot" (e.g. "anecdote").
        if 'dot' in token:
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def process_dataset(filename, min_freq=50):
    """Collect bigram statistics and POS tags from a line-oriented text file.

    Each line is tokenized with ``process_text``; bigrams never span two
    lines.

    Args:
        filename: Path of the text file to read.
        min_freq: Minimum corpus-wide count a bigram needs to be kept in the
            returned frequency table. Defaults to 50.

    Returns:
        A 5-tuple ``(bigram_freq, first_word_freq, second_word_freq,
        bigram_pos, total_words)``:

        * ``bigram_freq`` - plain dict mapping each bigram with a count of
          at least ``min_freq`` to that count.
        * ``first_word_freq`` / ``second_word_freq`` - defaultdict(int)
          counting how often a word occurs as the first / second element of
          any bigram (not filtered by ``min_freq``).
        * ``bigram_pos`` - defaultdict mapping every bigram seen (not
          filtered by ``min_freq``) to its list of ``(word, tag)`` pairs.
        * ``total_words`` - total number of tokens over all lines.
    """
    # Words whose tag is always forced to 'NN', regardless of the tagger.
    noun_overrides = {'data', 'web'}

    bigram_freq = FreqDist()
    first_word_freq = defaultdict(int)
    second_word_freq = defaultdict(int)
    # Values are lists of (word, tag) pairs, so the default is a list too.
    bigram_pos = defaultdict(list)
    total_words = 0

    with open(filename, 'r') as file:
        for line in file:
            tokens = process_text(line)
            if not tokens:
                continue

            total_words += len(tokens)
            bigram_freq_line = FreqDist(ngrams(tokens, 2))
            bigram_freq.update(bigram_freq_line)

            for bigram, freq in bigram_freq_line.items():
                first_word, second_word = bigram
                first_word_freq[first_word] += freq
                second_word_freq[second_word] += freq

                # The bigram is tagged in isolation, so its tags never
                # change between lines; tag it once instead of re-running
                # the tagger on every line that contains it.
                if bigram not in bigram_pos:
                    tagged_bigram = nltk.pos_tag(bigram)
                    bigram_pos[bigram] = [
                        (word, 'NN' if word in noun_overrides else tag)
                        for word, tag in tagged_bigram
                    ]

    # Filter out bigrams that occur fewer than min_freq times
    bigram_freq = {bigram: freq for bigram, freq in bigram_freq.items() if freq >= min_freq}

    return bigram_freq, first_word_freq, second_word_freq, bigram_pos, total_words

# Gather corpus statistics once; everything below reads from these names.
(
    bigram_freq,
    first_word_freq,
    second_word_freq,
    bigram_pos,
    total_words,
) = process_dataset("AI_5_only_english.txt")
print(bigram_freq)

# The frequency table is keyed by bigram, so its keys are the distinct candidates.
unique_bigrams = set(bigram_freq)

# Keep bigrams whose tags match one of: NN NN, NN NNS, JJ NN, JJ NNS
# (i.e. first tag is NN or JJ, second tag is NN or NNS).
selected_bigrams = []
for bigram in unique_bigrams:
    tagged_bigram = bigram_pos[bigram]  # tags stored by process_dataset
    first_tag = tagged_bigram[0][1]
    second_tag = tagged_bigram[1][1]
    if first_tag in ('NN', 'JJ') and second_tag in ('NN', 'NNS'):
        selected_bigrams.append((bigram, tagged_bigram))

# Order the selection for output.
# NOTE(review): this sorts by the bigram tuple itself (alphabetical), not by
# frequency - confirm that this is the intended "ranking".
ranked_bigrams = list(selected_bigrams)
ranked_bigrams.sort()

# Write one CSV row per selected bigram, preceded by a header row.
with open('output4.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Bigram", "POS Tag", "Frequency", "First Word Frequency", "Second Word Frequency"])
    writer.writerows(
        [pair, tags, bigram_freq[pair], first_word_freq[pair[0]], second_word_freq[pair[1]]]
        for pair, tags in ranked_bigrams
    )


