In [None]:
!pip install gradio

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
import pandas as pd
from nltk.util import ngrams
import pickle
import torch
from torch.autograd import Variable
import sys
!pip3 install skipthoughts
from skipthoughts import BiSkip
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

In [None]:
!pip3 install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
def generate_dict(path="../input/dictionary1/dictionary.txt"):
    """Build the vocabulary and the word-to-index table from a word file.

    Args:
        path: Location of the whitespace-separated dictionary text file.
            Defaults to the path the notebook originally hard-coded.

    Returns:
        A ``(vocab, w2i)`` tuple. ``vocab`` is the sorted list of unique
        tokens found in the file; ``w2i`` maps each token to its 1-based
        position in ``vocab`` (index 0 is never assigned to a word).
    """
    # Context manager so the handle is closed even if reading fails.
    with open(path, "r") as f:
        # split() with no argument collapses runs of whitespace, so blank
        # lines or a trailing newline do not add an empty-string "word".
        tokens = f.read().split()

    # Sorted so the index assignment is reproducible across kernel restarts;
    # iterating a set of strings gives a different order from run to run.
    vocab = sorted(set(tokens))
    w2i = {word: idx for idx, word in enumerate(vocab, start=1)}
    return vocab, w2i

In [None]:
# Build the vocabulary list and the word -> index table once for the whole notebook.
vocab, w2i = generate_dict()

In [None]:
# Directory handed to BiSkip as its first argument.
# NOTE(review): presumably where the pretrained skip-thoughts files are read from / cached — confirm.
dir_st = 'skipthoughts'
# Sentence encoder built over the vocabulary returned by generate_dict().
biskip = BiSkip(dir_st, vocab)

In [None]:
def external_features(headline, body):
    """Compute hand-crafted overlap and sentiment features for a headline/body pair.

    Layout of the returned vector:
      * for every character n-gram size n in 2..16, three counts: headline
        n-grams shared with the full body, with the first 255 body
        characters, and with the first 100 body characters;
      * for every word n-gram size n in 2..6, two counts: headline n-grams
        shared with the full body and with the words of the first 255 body
        characters;
      * the number of distinct words shared by headline and body divided by
        the larger of the two distinct-word counts (0.0 when neither text
        contains a word);
      * the absolute differences of the VADER 'neg', 'neu', 'pos' and
        'compound' scores of headline and body.

    Args:
        headline: Headline text.
        body: Body text.

    Returns:
        1-D numpy array with 60 entries (45 + 10 + 1 + 4).
    """
    # Same padding options for every ngrams() call.
    pad = dict(pad_right=True, right_pad_symbol='</s>')

    # Lowercase once instead of on every loop iteration.
    head_lower = headline.lower()
    body_lower = body.lower()

    vec = []

    # Character n-grams
    for n in range(2, 17):
        head_grams = set(ngrams(head_lower, n, **pad))
        vec.append(len(head_grams.intersection(ngrams(body_lower, n, **pad))))
        vec.append(len(head_grams.intersection(ngrams(body_lower[:255], n, **pad))))
        vec.append(len(head_grams.intersection(ngrams(body_lower[:100], n, **pad))))

    # Word n-grams
    head_tokens = head_lower.split()
    body_tokens = body_lower.split()
    body_prefix_tokens = body_lower[:255].split()
    for n in range(2, 7):
        head_grams = set(ngrams(head_tokens, n, **pad))
        vec.append(len(head_grams.intersection(ngrams(body_tokens, n, **pad))))
        vec.append(len(head_grams.intersection(ngrams(body_prefix_tokens, n, **pad))))

    # Number of common words between headline and body with respect to total words
    # (case-sensitive, exactly as before).
    words_set1 = set(headline.split())
    words_set2 = set(body.split())
    larger = max(len(words_set1), len(words_set2))
    # Both texts empty or whitespace-only: nothing can be shared, so report
    # 0.0 instead of raising ZeroDivisionError.
    common_words = len(words_set1.intersection(words_set2)) / larger if larger else 0.0
    vec.append(common_words)

    sid_obj = SentimentIntensityAnalyzer()
    d1 = sid_obj.polarity_scores(headline)
    d2 = sid_obj.polarity_scores(body)

    # Calculate sentiment differences and add them to the feature vector
    vec.extend(abs(d1[key] - d2[key]) for key in ['neg', 'neu', 'pos', 'compound'])

    return np.array(vec)

In [None]:
import nltk
nltk.download('punkt',download_dir='/usr/local/share/nltk_data')
nltk.download('wordnet',download_dir='/usr/local/share/nltk_data')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
!unzip ../../usr/local/share/nltk_data/corpora/wordnet.zip -d ../../usr/local/share/nltk_data/corpora/
!ls -r ../../usr/local/share/nltk_data/corpora/

In [None]:

# Module-level WordNet lemmatizer, created once and reused by normalize_word.
_wnl = nltk.WordNetLemmatizer()
def normalize_word(word):
    """Lemmatize ``word`` with the shared WordNet lemmatizer, then lowercase the result."""
    lemma = _wnl.lemmatize(word)
    return lemma.lower()

def get_tokenized_lemmas(s):
    """Tokenize ``s`` with NLTK and return the normalized form of every token.

    Args:
        s: Text to tokenize.

    Returns:
        List of tokens, each passed through ``normalize_word``, in the order
        produced by ``nltk.word_tokenize``.
    """
    # A comprehension replaces the local that was named ``list`` and
    # shadowed the builtin for the rest of the function.
    return [normalize_word(token) for token in nltk.word_tokenize(s)]

def clean(s):
    """Lowercase ``s`` and reduce it to its word-character runs joined by single spaces.

    Every maximal run of ``\\w`` characters is kept; everything between runs
    (punctuation, whitespace) collapses to one space.
    """
    words = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(words)
    return joined.lower()

def remove_stopwords(l):
    """Drop scikit-learn's English stop words from a list of tokens.

    Args:
        l: Iterable of tokens.

    Returns:
        New list holding, in their original order, the tokens that are not in
        ``feature_extraction.text.ENGLISH_STOP_WORDS``.
    """
    stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
    # A comprehension replaces the accumulator that was named ``list`` and
    # shadowed the builtin.
    return [word for word in l if word not in stop_words]

def preprocess(headlines,bodies):
    """Clean, lemmatize and stop-word-filter paired headlines and bodies.

    Each text goes through ``clean`` -> ``get_tokenized_lemmas`` ->
    ``remove_stopwords`` and the surviving tokens are re-joined with single
    spaces.

    Args:
        headlines: Iterable of headline strings.
        bodies: Iterable of body strings, paired with ``headlines`` by position.

    Returns:
        Tuple of two pandas Series named 'Headline' and 'Body' holding the
        processed strings, one entry per input pair.
    """
    n_headlines, n_bodies = [], []
    # The enumerate() index was never used, so iterate over the pairs directly.
    for headline, body in tqdm(zip(headlines, bodies)):
        headline_tokens = remove_stopwords(get_tokenized_lemmas(clean(headline)))
        body_tokens = remove_stopwords(get_tokenized_lemmas(clean(body)))
        n_headlines.append(' '.join(headline_tokens))
        n_bodies.append(' '.join(body_tokens))

    # Build the named Series directly rather than via two single-column
    # DataFrames that existed only to call .apply(' '.join).
    return (
        pd.Series(n_headlines, name='Headline', dtype=object),
        pd.Series(n_bodies, name='Body', dtype=object),
    )

The following commented-out cells fit the TF-IDF vectorizers and dump them to .pkl files so that they can be reused.

In [None]:
# df = pd.read_csv('../input/dataset2/train_Set.csv')
# stop_words_l=stopwords.words('english')

# df['Headline'], df['Body'] = preprocess(df['Headline'],df['Body'])

In [None]:

# headline_vectorizer = TfidfVectorizer(stop_words=stop_words_l,max_features=5000, ngram_range=(1, 1))
# headline_vectorizer.fit(df['Headline'])
# vocab_sz_headline = len(headline_vectorizer.vocabulary_)
# with open('h_vectorizer.pkl', 'wb') as file:
#     pickle.dump(headline_vectorizer, file)

In [None]:
# body_vectorizer = TfidfVectorizer(stop_words=stop_words_l,max_features=10000-vocab_sz_headline)
# body_vectorizer.fit(df['Body'])
# with open('b_vectorizer.pkl', 'wb') as file:
#     pickle.dump(body_vectorizer, file)

The required vectorizers are loaded from the respective pickle files

In [None]:
# Load the headline and body vectorizers that were pickled earlier.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# use files that come from a trusted source.
with open('../input/vectorizers/h_vectorizer.pkl', 'rb') as file:
    headline_vectorizer = pickle.load(file)
with open('../input/vectorizers/b_vectorizer.pkl', 'rb') as file:
    body_vectorizer = pickle.load(file)

In [None]:
def statistical_features(headline, body):
    """Vectorize one headline/body pair with the loaded vectorizers.

    The pair is run through ``preprocess`` and each side is transformed by
    its own vectorizer; the two dense results are joined column-wise.

    Returns:
        2-D numpy array with a single row: headline features followed by
        body features.
    """
    clean_heads, clean_bodies = preprocess([headline], [body])
    head_matrix = headline_vectorizer.transform(clean_heads).toarray()
    body_matrix = body_vectorizer.transform(clean_bodies).toarray()
    return np.concatenate((head_matrix, body_matrix), axis=1)

In [None]:
def neural_features(headline, body):
    """Encode a headline/body pair with BiSkip and combine the two encodings.

    Words are mapped to integer ids through the module-level ``w2i`` table;
    a word missing from the table keeps id 0, the same value used for the
    trailing padding column.
    NOTE(review): an unknown word therefore leaves a 0 in the middle of the
    id sequence — confirm this matches how the training features were built
    before changing it.

    Args:
        headline: Headline text, split on whitespace.
        body: Body text, split on whitespace.

    Returns:
        1-D float32 numpy array: the element-wise product of the two
        encodings followed by their element-wise absolute difference.
    """
    def _id_batch(text):
        # One id per whitespace token plus one extra trailing 0 column.
        row = [w2i.get(word, 0) for word in text.split()] + [0]
        # The single example is duplicated into a batch of two and only row 0
        # of the result is used below.
        # NOTE(review): presumably BiSkip cannot handle a batch of one —
        # confirm before simplifying.
        return torch.from_numpy(np.array([row, row], dtype=np.int64))

    # Ids are built as int64 from the start; the earlier float32 buffer only
    # represented them exactly up to 2**24.
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        head_enc = biskip(_id_batch(headline)).detach().numpy()
        body_enc = biskip(_id_batch(body)).detach().numpy()

    head_enc = np.asarray(head_enc, dtype=np.float32)
    body_enc = np.asarray(body_enc, dtype=np.float32)

    features = np.concatenate(
        (np.multiply(head_enc, body_enc), np.absolute(head_enc - body_enc)),
        axis=1,
    )
    return features[0]

In [None]:
class Model(nn.Module):
    """Three-branch feed-forward classifier over neural, external and statistical features.

    Each feature family goes through its own small network; the three branch
    outputs are concatenated, batch-normalized and mapped to ``out_dim``
    softmax probabilities.

    Args:
        inp_dim_nf: Width of the neural-feature input.
        inp_dim_ef: Width of the external-feature input.
        inp_dim_sf: Width of the statistical-feature input.
        out_dim: Number of output classes.
    """

    def __init__(self, inp_dim_nf=4800, inp_dim_ef=60, inp_dim_sf=10000, out_dim=4):
        super().__init__()
        out_dim_nf = 100
        out_dim_ef = 50
        out_dim_sf = 50

        # NOTE: the attribute names and the position of every layer inside
        # each Sequential determine the state_dict keys, so they must stay
        # exactly as they are for saved checkpoints to load.

        # Neural network for input Neural Features
        self.model_nf = nn.Sequential(
            nn.Linear(inp_dim_nf, 500),
            nn.Sigmoid(),
            nn.Dropout(0.2),
            nn.Linear(500, out_dim_nf),
            nn.Sigmoid()
        )

        # Neural network for input External Features
        self.model_ef = nn.Sequential(
            nn.Linear(inp_dim_ef, out_dim_ef),
            nn.ReLU()
        )

        # Neural network for input Statistical Features
        self.model_sf = nn.Sequential(
            nn.Linear(inp_dim_sf, 500),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(500, out_dim_sf),
            nn.ReLU()
        )

        # Feature Combined model
        # NOTE(review): the head ends in Softmax, so outputs are probabilities;
        # if training used nn.CrossEntropyLoss that applies softmax twice — confirm.
        self.fc = nn.Sequential(
            nn.BatchNorm1d(out_dim_nf + out_dim_ef + out_dim_sf),
            nn.Linear(out_dim_nf + out_dim_ef + out_dim_sf, out_dim),
            nn.Softmax(dim=1)
        )

    @staticmethod
    def _weight_l2_sum(branch):
        """Return the sum of the L2 norms of every 'weight' parameter in ``branch``."""
        norms = [
            torch.norm(param, p=2)
            for name, param in branch.named_parameters()
            if 'weight' in name
        ]
        if not norms:
            return torch.tensor(0.0)
        # Stacking the norms keeps the result on the parameters' own device;
        # the previous CPU-created accumulator did not follow the model to GPU.
        return torch.stack(norms).sum()

    def forward(self, x_nf, x_ef, x_sf):
        """Run all three branches and the combined head.

        Args:
            x_nf: Neural features, shape (batch, inp_dim_nf).
            x_ef: External features, shape (batch, inp_dim_ef).
            x_sf: Statistical features, shape (batch, inp_dim_sf).

        Returns:
            Tuple ``(o, l2_reg_nf, l2_reg_sf)``: class probabilities of shape
            (batch, out_dim) and the scalar weight-norm sums of the neural
            and statistical branches.
        """
        h_nf = self.model_nf(x_nf)
        h_ef = self.model_ef(x_ef)
        h_sf = self.model_sf(x_sf)

        l2_reg_nf = self._weight_l2_sum(self.model_nf)
        l2_reg_sf = self._weight_l2_sum(self.model_sf)

        # Concatenate the outputs
        h = torch.cat((h_nf, h_ef, h_sf), dim=1)
        # Final prediction
        o = self.fc(h)

        return o, l2_reg_nf, l2_reg_sf

In [None]:
# Instantiate the network with its default layer sizes; the trained weights are loaded separately.
model = Model()

In [None]:
# Restore the trained weights, remapping every tensor to CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load("../input/fncmodel/fncmodel.pth", map_location=torch.device('cpu')))

In [None]:
def preprocess1(s):
    """Lowercase ``s``, delete newlines and expand three contraction suffixes.

    The substitutions run one after another in the listed order, each on the
    result of the previous one.
    """
    text = s.lower()
    # (substring, replacement) pairs, applied sequentially.
    substitutions = (
        ("\n", ""),
        ("\'s", " is"),
        ("n\'t", " not"),
        ("\'d", " would"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

In [None]:
def classify(head, body):
    """Predict the stance label of an English headline/body pair.

    Both texts are normalized with ``preprocess1``, the statistical, external
    and neural feature vectors are computed, and the module-level ``model``
    is run in eval mode on a batch of one.

    Returns:
        One of 'agree', 'disagree', 'discuss', 'unrelated'.
    """
    head, body = preprocess1(head), preprocess1(body)

    stat_vec = statistical_features(head, body)[0]
    ext_vec = external_features(head, body)
    neural_vec = neural_features(head, body)

    # float tensors with a leading batch axis, in the order forward() expects
    batch = [
        torch.from_numpy(features).float().unsqueeze(0)
        for features in (neural_vec, ext_vec, stat_vec)
    ]

    with torch.no_grad():
        model.eval()
        out, _reg_nf, _reg_sf = model(*batch)

    class_labels = ['agree', 'disagree', 'discuss', 'unrelated']
    predicted = out.argmax(dim=1)
    return class_labels[predicted]

In [None]:
head = "Argentine President Takes On Godson — But Not To Keep Werewolf At Bay"
body = '''"Nope. Argentina’s President Cristina Fernández de Kirchner has not become godmother of a Jewish baby to stop him from becoming a werewolf – despite what you may have read in multiple news reports.'''

In [None]:
classify(head, body)

In [None]:
!pip3 install googletrans==4.0.0-rc1

In [None]:
from googletrans import Translator,LANGUAGES
def translate(txt, src, dest):
    """Translate ``txt`` from language code ``src`` to ``dest`` with googletrans.

    A fresh ``Translator`` is created for every call.

    Returns:
        The ``text`` attribute of the translation result.
    """
    result = Translator().translate(txt, src=src, dest=dest)
    return result.text

In [None]:
def classifier(head, body, lang):
    """Classify a headline/body pair, translating Telugu input to English first.

    Args:
        head: Headline text.
        body: Body text.
        lang: 'Telugu' triggers translation from 'te' to 'en'; any other
            value passes the texts to ``classify`` unchanged.

    Returns:
        The label returned by ``classify``.
    """
    if lang == 'Telugu':
        # Headline is translated first, then the body.
        head, body = (translate(text, 'te', 'en') for text in (head, body))
    return classify(head, body)

In [None]:
head = 'బ్యాంకులకు వరుసగా 3 రోజులు సెలవు.. డిసెంబర్ లిస్ట్ ఇదే.. పనులుంటే ముందే చూస్కోండి!'
body = 'బ్యాంక్ కస్టమర్లకు ముఖ్యమైన అలర్ట్. ఈ నెలలో బ్యాంకులకు వరుసగా సెలవులు ఉన్నాయి. పలు పండగల నేపథ్యంలో అన్ని ప్రాంతాల్లో కలుపుకొని 18 రోజులు బ్యాంకులు పనిచేయకపోవచ్చు. ఇవి ప్రాంతాల్ని బట్టి మారతాయని తెలిసిందే. అయితే మీకు బ్యాంకులో ఏదైనా పని ఉంటే.. ఆర్‌బీఐ సెలవుల క్యాలెండర్ చూసుకొని వెళ్తే మంచిదిు'
# Show the English translation of the headline and of the body, then the predicted label.
print(translate(head, 'te', 'en'))
print()
print(translate(body, 'te', 'en'))
print()
print(classifier(head, body, 'Telugu'))

In [None]:
head = 'నేడే ఇండియా, ఆస్ట్రేలియా నాలుగో టీ20.. భారమంతా బౌలర్ల చేతిలోనే.. జట్టులోకి పెళ్లికొడుకు!'
body = 'ఇండియా, ఆస్ట్రేలియా టీ20 సమరంలో మరో కీలకమ్యాచ్‌కు రెండు జట్లు సిద్ధమయ్యాయి. ఐదు మ్యాచ్‌ల సిరీస్‌లో టీమిండియా రెండు, ఆస్ట్రేలియా మూడో టీ20లో గెలుపొందాయి. సిరీస్‌లో 2-1 ఆధిక్యంలో ఉన్న టీమిండియా.. నాలుగో టీ20లో గెలిచి సిరీస్ కైవసం చేసుకోవాలని పట్టుదలగా ఉంది. మరోవైపు మూడో టీ20 ప్రదర్శనను రిపీట్ చేసి సిరీస్ సమం చేయాలని కంగారూలు భావిస్తున్నారు. ఈ నేపథ్యంలో ఇవాళ (శుక్రవారం) రాయ్‌పూర్ వేదికగా జరగనున్న నాలుగో టీ20 మ్యాచ్‌ రెండు జట్లకు ప్రతిష్టాత్మకంగా మారిందిు'
# Show the English translation of the headline and of the body, then the predicted label.
print(translate(head, 'te', 'en'))
print()
print(translate(body, 'te', 'en'))
print()
print(classifier(head, body, 'Telugu'))

In [None]:
# English example: classified directly, with no translation step.
head = 'No Electricity At Stadium Hosting India Vs Australia T20 Today. Bill Not Paid'
body = '''The stadium has an outstanding bill of ₹ 3.16 crore, due to which the electricity connection at the stadium had been cut 5 years ago.'''

# print(translate(head, 'te', 'en'))
# print()
# print(translate(body, 'te', 'en'))
# print()
print(classifier(head, body, 'English'))

In [None]:
# Round-trip check: classify the English pair, translate it to Telugu, and
# classify the Telugu version to compare the two predictions.
head = 'No Electricity At Stadium Hosting India Vs Australia T20 Today. Bill Not Paid'
body = '''The stadium has an outstanding bill of ₹ 3.16 crore, due to which the electricity connection at the stadium had been cut 5 years ago.'''

# Prediction on the original English text.
print(classifier(head, body, 'English'))

# Headline: original, Telugu translation, and that translation mapped back to English.
h = (translate(head, 'en', 'te'))
print()
print(head)
print(h)
print((translate(h, 'te', 'en')))
print()
# Body: original, Telugu translation, and that translation mapped back to English.
b = (translate(body, 'en', 'te'))
print(body)
print(b)
print((translate(b, 'te', 'en')))
print()
# Prediction on the Telugu pair (translated back to English inside classifier).
print(classifier(h, b, 'Telugu'))