# Political bias classifier training



To print the result, you can use the following snippet:

In [20]:
def print_prediction_results(docs, tc):
    """Print the per-label scores and the top-scoring label for each document.

    Args:
        docs: iterable of documents; each is passed to ``tc.predict`` on its
            own and its ``.doc`` attribute is printed as the header line.
        tc: classifier component exposing ``labels`` and ``predict(list_of_docs)``,
            where ``predict`` returns one sequence of scores per document, in the
            same order as ``tc.labels``.

    Returns:
        None. Everything is written to stdout.
    """
    for current in docs:
        scores = tc.predict([current])
        print(f"{current.doc}")
        # Track the running best; a strict ">" keeps the first label on ties.
        best_label, best_score = "", -1
        for label, score in zip(tc.labels, scores[0]):
            if score > best_score:
                best_label, best_score = label, score
            print("{}   \t{:.2f}%".format(label, score*100))
        print(f"sentiment: {best_label}")
        print("===========================")


```bash
python -m spacy download en_core_web_lg

pip install spacy-lookups-data
```

In [16]:
# Paths used by the cells below (all relative to the notebook's working directory).
# MODEL_DIR  - output directory handed to spacy's train().
# MODEL_BEST - pipeline directory later loaded with spacy.load().
# TEST/TRAIN - where the serialized DocBin files for each split are written.
# CONFIG     - training config file handed to train().
MODEL_DIR = './NLP/spacy_model'
MODEL_BEST = './NLP/spacy_model/model-best/'
TEST = './NLP/political_test.spacy'
TRAIN = './NLP/political_train.spacy'
CONFIG = './NLP/config.cfg'

### Imports 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.cli.train import train
import spacy
from spacy.tokens import DocBin
from sklearn.metrics import classification_report

In [4]:
# Record the installed spaCy release next to the results it produced.
print(f"spaCy version: {spacy.__version__}")

spaCy version: 3.7.4


### Read the political bias dataset

In [10]:
def replace_label(df):
    """Convert the numeric ``label`` column of ``df`` to ``'left'`` / ``'right'`` in place.

    ``1`` becomes ``'left'`` and ``0`` becomes ``'right'``. Values that are already
    one of those two names are kept, so calling this twice on the same frame
    (for example when a notebook cell is re-run) no longer wipes the column to
    NaN. Any other value still becomes NaN.

    Args:
        df: DataFrame with a ``label`` column; it is modified in place.

    Returns:
        None.
    """
    label_map = {1: 'left', 0: 'right'}
    known_names = set(label_map.values())

    def _to_name(value):
        # Already converted on a previous call: pass through unchanged.
        if isinstance(value, str) and value in known_names:
            return value
        return label_map.get(value, float('nan'))

    df['label'] = df['label'].map(_to_name)

In [None]:
## https://huggingface.co/datasets/JyotiNayak/political_ideologies

In [11]:
# Load the training split, rewrite its numeric labels as 'left'/'right' in place, and preview it.
df_train = pd.read_parquet("Train_dataset/political_bias_train.parquet")
replace_label(df_train)
df_train.head()

Unnamed: 0,statement,label,issue_type,__index_level_0__
0,"Climate change, and the escalating environment...",left,1,465
1,I believe in the foundational importance of th...,right,2,1191
2,I firmly believe that the principle of separat...,left,6,2440
3,I firmly believe in the separation of church a...,left,6,2406
4,I firmly believe in the power of free markets ...,right,0,1903


In [12]:
# Load the test split, rewrite its numeric labels as 'left'/'right' in place, and preview it.
df_test = pd.read_parquet("Train_dataset/political_bias_test.parquet")
replace_label(df_test)
df_test.head()

Unnamed: 0,statement,label,issue_type,__index_level_0__
0,While respecting individual rights is paramoun...,right,7,1777
1,The continuous economic dependence on China ha...,right,3,1342
2,I firmly believe in the sanctity and tradition...,right,2,2700
3,While I recognize and empathize with the chall...,right,5,3100
4,I firmly believe in preserving the integrity o...,right,6,984


In [71]:
# NOTE(review): in the recorded run this cell failed with FileNotFoundError. The path nests
# "Validation_dataset" under "Train_dataset" and names a *train* parquet file - confirm the
# intended validation file. No other cell visible in this notebook reads df_validation.
df_validation = pd.read_parquet("Train_dataset/Validation_dataset/political_bias_train.parquet")
replace_label(df_validation)
df_validation.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Train_dataset/Validation_dataset/political_bias_train.parquet'

### Convert train and test to docbin 

In [84]:
# Blank English pipeline; df2docbin below only uses its make_doc() method.
nlp = spacy.blank("en")

def df2docbin(df, nlp):
    """Pack the rows of ``df`` into a spaCy ``DocBin`` with text-classification labels.

    Args:
        df: DataFrame with a ``statement`` column (the text) and a ``label``
            column holding a string that is compared case-insensitively with
            ``'right'`` and ``'left'``.
        nlp: object whose ``make_doc(text)`` builds the document for each row.

    Returns:
        A ``DocBin`` holding one document per row; each document's ``cats`` maps
        ``'right'`` and ``'left'`` to 1.0 for the row's label and 0.0 otherwise.
    """
    categories = ('right', 'left')
    packed = DocBin()
    for _, record in df.iterrows():
        example = nlp.make_doc(record['statement'])
        gold = record['label'].lower()
        # One-hot encoding over the two categories.
        example.cats = {name: (1.0 if gold == name else 0.0) for name in categories}
        packed.add(example)
    return packed


# Build one DocBin per split.
docbin_train = df2docbin(df_train, nlp)
docbin_test = df2docbin(df_test, nlp)

# Serialize both to the TRAIN / TEST paths (presumably the files CONFIG points at - verify).
docbin_train.to_disk(TRAIN)
docbin_test.to_disk(TEST)

### Train the model 

In [87]:
# Train from CONFIG and save the pipeline under MODEL_DIR; use_gpu=-1 runs on the CPU
# (the recorded log reports "Using CPU").
train(output_path=MODEL_DIR, use_gpu=-1, config_path=CONFIG)

[38;5;4mℹ Saving to output directory: NLP/spacy_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  CATS_MICRO_P  CATS_MICRO_R  SCORE 
---  ------  ------------  ----------  ------------  ------------  ------
  0       0          0.25       33.33         50.00         50.00    0.44
  0     200         43.39       77.33         77.50         77.50    0.77
  0     400         28.98       90.31         90.31         90.31    0.89
  0     600         16.54       82.13         82.50         82.50    0.82
  0     800         16.41       88.65         88.75         88.75    0.88
  0    1000          9.71       94.37         94.38         94.38    0.93
  1    1200          9.94       91.84         91.88         91.88    0.91
  1    1400          6.08       94.68         94.69         94.69    0.94
  1    1600          4.27       95.00         95.00 

### Evaluate the model

In [18]:
# Load the trained pipeline from MODEL_BEST. This rebinds `nlp`, which earlier in the
# notebook referred to the blank English pipeline.
nlp = spacy.load(MODEL_BEST)
# Read the serialized test DocBin back and rebuild its docs against the loaded vocab.
test_data = DocBin().from_disk(TEST)
test_docs = list(test_data.get_docs(nlp.vocab))

def evaluate(nlp, docs):
    """Score ``nlp`` against labelled docs and return the classification report as a table.

    Args:
        nlp: callable pipeline; ``nlp(text)`` must return a doc whose ``cats``
            maps each label to a score.
        docs: iterable of labelled docs. The gold label of each is the key with
            the highest value in ``doc.cats``; only ``doc.text`` is sent to ``nlp``.

    Returns:
        DataFrame built from ``classification_report(..., output_dict=True)`` and
        transposed so each report entry is a row. The ``accuracy`` entry is a
        single number, so that row repeats it in every column.
    """
    def top_category(cats):
        # Key with the highest score.
        return max(cats, key=cats.get)

    gold, predicted = [], []
    for labelled in docs:
        gold.append(top_category(labelled.cats))
        rescored = nlp(labelled.text)
        predicted.append(top_category(rescored.cats))

    summary = classification_report(gold, predicted, output_dict=True)
    return pd.DataFrame(summary).transpose()

# Score the loaded pipeline on the held-out test docs.
eval_results = evaluate(nlp, test_docs)

# Show the report rounded to two decimals.
print("Evaluation Results:")
print(eval_results.round(2)) 

Evaluation Results:
              precision  recall  f1-score  support
left               0.99    0.99      0.99   160.00
right              0.99    0.99      0.99   160.00
accuracy           0.99    0.99      0.99     0.99
macro avg          0.99    0.99      0.99   320.00
weighted avg       0.99    0.99      0.99   320.00


### Print predictions for individual sentences

In [62]:
# Load the trained pipeline again and take its 'textcat' component.
loaded_model = spacy.load(MODEL_BEST)

textcat = loaded_model.get_pipe('textcat')
# Build docs with make_doc() for every test statement; print_prediction_results calls
# textcat.predict() on each one and prints the scores.
docs = [ loaded_model.make_doc(sentence) for sentence in df_test['statement'] ]
print_prediction_results(docs, textcat)

While respecting individual rights is paramount, unchecked immigration can put undue stress on our economy, infrastructure, and social services. A well-regulated immigration system, prioritizing law and order, is integral to maintain a stable society. It is essential to ensure that those who wish to call the United States home can contribute meaningfully to our nation's growth and respect the country's principles and values.
right   	95.07%
left   	4.93%
sentiment: right
The continuous economic dependence on China has led to an unsettling imbalance in global power dynamics. It's crucial to diversify our sources of imports and promote domestic industries to ensure national security and economic stability. A comprehensive review of international trade agreements is, therefore, an urgent necessity to protect our national interests.
right   	91.50%
left   	8.50%
sentiment: right
I firmly believe in the sanctity and traditional structure of the family unit as the pillar of our society. When

In [37]:
# Load the review CSV stored under Data/tt12037194 and preview the first rows.
# NOTE(review): no later cell visible in this notebook uses df_mad_max_furiosa.
df_mad_max_furiosa = pd.read_csv("Data/tt12037194/tt12037194_review.csv")
df_mad_max_furiosa.head()


Unnamed: 0,review_title,review_body,rating,reviewer_name,review_date
0,Do you have it in you to make it epic?,"George Miller... ""Why yes, yes I do. Well sort...",7.0,jethro-17881,23 May 2024
1,"Good start, ending could have been better",Pretty good movie. Enough action and with enou...,7.0,alwinsup,23 May 2024
2,15 years of Wasteland,It would be folly to try and outdo Fury Road a...,9.0,masonsaul,25 May 2024
3,The fifth rider of the apocalypse,,8.0,hemipristiss,22 May 2024
4,It was great!,I just finished watching the movie premiere he...,9.0,jinxedmihai,22 May 2024


In [91]:
# Load the review CSV stored under Data/tt6133466 and preview the first rows.
df_the_first_purge = pd.read_csv("Data/tt6133466/tt6133466_review.csv")
df_the_first_purge.head()

Unnamed: 0,review_title,review_body,rating,reviewer_name,review_date
0,A painful way to spend your time,,4.0,Leofwine_draca,6 February 2020
1,"Weakest of The Purge films, but it remains a f...","This fourth installment in the fascinating, bu...",6.0,a_chinn,1 October 2018
2,"Had Potential, Falls short",,5.0,pcytoman,4 July 2018
3,Worst Purge movie out of all of them,The characters are underdeveloped and the acti...,3.0,DeepcavernImp,24 July 2021
4,Really amusing movie.,I haven't seen the original Purge movies. And ...,7.0,gerben188,4 August 2018


In [98]:
import numpy as np
import pandas as pd

def predict_and_sort(df, model, nlp, confidence_threshold=0.8, ratings=(1, 10), top_n=10):
    """Classify selected reviews and return the shortest high-confidence predictions.

    Reviews whose ``rating`` is in ``ratings`` and whose title and body are both
    present are turned into one text (title, title again, then body), passed
    through ``nlp`` and scored one at a time with ``model.predict``.

    NOTE(review): the title is concatenated twice before the body. This is kept
    from the original code - confirm whether it is deliberate up-weighting.

    Args:
        df: DataFrame with ``review_title``, ``review_body`` and ``rating`` columns.
        model: classifier exposing ``labels`` and ``predict(list_of_docs)``, where
            ``predict`` returns one sequence of scores per document, in the same
            order as ``model.labels``.
        nlp: callable that turns a string into a document with a ``text`` attribute.
        confidence_threshold: keep only predictions whose top score is strictly
            greater than this value.
        ratings: rating values to keep before predicting (default: 1 and 10).
        top_n: maximum number of rows to return (default: 10).

    Returns:
        DataFrame with columns ``review_title``, ``document``,
        ``predicted_sentiment``, ``confidence``, ``rating`` and ``text_length``,
        sorted by ascending ``text_length`` and cut to ``top_n`` rows. If no
        review is eligible, an empty frame with those columns is returned.
    """
    result_columns = ["review_title", "document", "predicted_sentiment", "confidence", "rating"]

    df_filtered = df[df['rating'].isin(list(ratings))]

    results = []
    for title, body, rating in zip(df_filtered['review_title'], df_filtered['review_body'], df_filtered['rating']):
        # Reviews with a missing title or body cannot be concatenated; skip them.
        if pd.isna(title) or pd.isna(body):
            continue
        doc = nlp(title + " " + title + " " + body)
        prediction_scores = model.predict([doc])[0]
        max_index = int(np.argmax(prediction_scores))
        results.append({
            "review_title": title,
            "document": doc.text,
            "predicted_sentiment": model.labels[max_index],
            "confidence": prediction_scores[max_index],
            "rating": rating,
        })

    # Without this guard the frame has no 'confidence' column and the filter
    # below raises KeyError.
    if not results:
        return pd.DataFrame(columns=result_columns + ["text_length"])

    df_predicted = pd.DataFrame(results, columns=result_columns)

    # assign() builds a new frame, so no value is set on a slice of df_predicted
    # (the original in-place assignment triggered SettingWithCopyWarning).
    high_confidence_df = (
        df_predicted[df_predicted['confidence'] > confidence_threshold]
        .assign(text_length=lambda frame: frame['document'].str.len())
    )

    return high_confidence_df.sort_values(by='text_length', ascending=True).head(top_n)

# Example usage: the shortest predictions with confidence above 0.95 among the reviews
# of df_the_first_purge. `textcat` and `nlp` are the objects created in earlier cells.
sorted_df = predict_and_sort(df_the_first_purge, textcat, nlp, confidence_threshold=0.95)
sorted_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_confidence_df['text_length'] = high_confidence_df['document'].apply(len)


Unnamed: 0,review_title,document,predicted_sentiment,confidence,rating,text_length
135,Classism at its best great movie,Classism at its best great movie Classism at i...,right,0.984741,10.0,135
153,I reckon it was great.,I reckon it was great. I reckon it was great. ...,right,0.982804,10.0,180
128,The best of them all,The best of them all The best of them all I'll...,right,0.968937,10.0,226
34,Worst Purge Movie...,Worst Purge Movie... Worst Purge Movie... I am...,right,0.951106,1.0,258
64,Hands down the best Purge movie,Hands down the best Purge movie Hands down the...,right,0.973648,10.0,366
112,What a waste of a free movie ticket.,What a waste of a free movie ticket. What a wa...,right,0.960176,1.0,385
35,This is NOT what the Purge was about!,This is NOT what the Purge was about! This is ...,right,0.973193,1.0,396
1,"Horrible, racist garbage","Horrible, racist garbage Horrible, racist garb...",left,0.974399,1.0,500
95,After this movie all races can agree on one th...,After this movie all races can agree on one th...,left,0.950983,1.0,514
43,A once watchable franchise murdered on the tab...,A once watchable franchise murdered on the tab...,right,0.969823,1.0,535
