# Politic_ES

Análisis inicial de los datos

## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Visualización

In [2]:
%matplotlib inline

c_gender = {'male': 'blue', 'female': 'pink'}
c_profession = {'politician': 'gray', 'celebrity': 'yellow', 'journalist': 'orange'}
c_i_bin = {'left': 'red', 'right': 'blue'}
c_i_mul = {'left': 'red', 'moderate_left': 'lightcoral', 'moderate_right': 'turquoise', 'right': 'blue'}

## Seed para repetición

In [3]:
seed = 69420
np.random.seed(seed)

## Dataset

In [6]:
df = pd.read_csv(os.path.join("../practise_data", "politicES_phase_2_train_public.csv"))
df

Unnamed: 0,label,gender,profession,ideology_binary,ideology_multiclass,tweet
0,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,¡Feliz 28 de febrero a todas las andaluzas y a...
1,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,"Feliz año nuevo, feliz esperanza 💕. Querido 20..."
2,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,🇩🇪🇪🇸 ¡Un placer encontrarme con mi homólogo al...
3,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,El conflicto en Ucrania ha supuesto una dramát...
4,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,La Academia de la Llingua Asturiana realiza un...
...,...,...,...,...,...,...
179995,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Desde un inicio nos opusimos a la escalda mili...
179996,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,En menos de 4 minutos he tratado de analizar e...
179997,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Un fantasma recorre Euskal Herria y el Estado....
179998,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Aquí os dejo mis reflexiones hoy en el diario ...


In [7]:
# Ordenamos las variables categóricas a nuestro gusto, principalmente por motivos visuales de izquierda a derecha
df.ideology_binary = pd.Categorical(df.ideology_binary, ['left', 'right'])
df.ideology_multiclass = pd.Categorical(df.ideology_multiclass, ['left', 'moderate_left', 'moderate_right', 'right'])

# Analysis de los datos obtenidos

In [8]:
df[["gender", "ideology_binary", "profession",  "label"]].groupby(["gender", "profession", "ideology_binary"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,label
gender,profession,ideology_binary,Unnamed: 3_level_1
female,celebrity,left,3040
female,celebrity,right,640
female,journalist,left,22080
female,journalist,right,10720
female,politician,left,14240
female,politician,right,9840
male,celebrity,left,3920
male,celebrity,right,1440
male,journalist,left,36720
male,journalist,right,41280


# Separación en train y test

In [12]:
# Separamos en training y en test
parte_test = 0.3
df_train, df_test = np.split(df, [int((1 - parte_test) * len(df))])

In [13]:
df_train

Unnamed: 0,label,gender,profession,ideology_binary,ideology_multiclass,tweet
0,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,¡Feliz 28 de febrero a todas las andaluzas y a...
1,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,"Feliz año nuevo, feliz esperanza 💕. Querido 20..."
2,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,🇩🇪🇪🇸 ¡Un placer encontrarme con mi homólogo al...
3,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,El conflicto en Ucrania ha supuesto una dramát...
4,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,La Academia de la Llingua Asturiana realiza un...
...,...,...,...,...,...,...
125994,b75a1185a34e920e0b23b9e87acde174,female,journalist,left,moderate_left,Pero qué cosa tan hermosa es este episodio sob...
125995,b75a1185a34e920e0b23b9e87acde174,female,journalist,left,moderate_left,@user vaya fichaje que han hecho estos maldito...
125996,b75a1185a34e920e0b23b9e87acde174,female,journalist,left,moderate_left,"Por si quedaban dudas, Lula ganó en el 'Ohio' ..."
125997,b75a1185a34e920e0b23b9e87acde174,female,journalist,left,moderate_left,Las ausencias de los máximos responsables polí...


In [14]:
df_test

Unnamed: 0,label,gender,profession,ideology_binary,ideology_multiclass,tweet
125999,b75a1185a34e920e0b23b9e87acde174,female,journalist,left,moderate_left,Qué petición tan simple la que hace la cantant...
126000,b764da399179ff2948719066391eaecf,female,journalist,right,moderate_right,"He ido a ver Cyrano de Bergerac, de la compañí..."
126001,b764da399179ff2948719066391eaecf,female,journalist,right,moderate_right,Brutal esta entrevista de ⁦@user ⁩ a a una ucr...
126002,b764da399179ff2948719066391eaecf,female,journalist,right,moderate_right,"Cuando llegué a Madrid, mi compañera de piso d..."
126003,b764da399179ff2948719066391eaecf,female,journalist,right,moderate_right,4/ Mientras algunos quieren presentar a Shakir...
...,...,...,...,...,...,...
179995,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Desde un inicio nos opusimos a la escalda mili...
179996,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,En menos de 4 minutos he tratado de analizar e...
179997,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Un fantasma recorre Euskal Herria y el Estado....
179998,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Aquí os dejo mis reflexiones hoy en el diario ...


# PIPELINE Zero-Shot: Hugging Face 

In [16]:
pip install transformers

Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
     ---------------------------------------- 6.8/6.8 MB 17.3 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.11.0-py3-none-any.whl (10.0 kB)
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
     ------------------------------------- 200.1/200.1 kB 11.9 MB/s eta 0:00:00
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp311-cp311-win_amd64.whl (143 kB)
     -------------------------------------- 143.2/143.2 kB 8.3 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp311-cp311-win_amd64.whl (3.5 MB)
     ---------------------------------------- 3.5/3.5 MB 31.7 MB/s eta 0:00:00
Installing collected packages: tokenizers, pyyaml, filelock, huggingface-hub, transformers
Successfully installed filelock-3.11.0 huggingface-hub-0.13.4 pyyaml-6.0 tokenizers-0.13.3 transformers-4.27.4
Note: you may need to


[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: C:\Users\manfe\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [17]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
#load clasificator , set device 0 to use GPU
zero_pipeline = pipeline("zero-shot-classification", device=0)

No model was supplied, defaulted to roberta-large-mnli and revision 130fb28 (https://huggingface.co/roberta-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 688/688 [00:00<00:00, 687kB/s]
Downloading tf_model.h5: 100%|██████████| 1.43G/1.43G [00:22<00:00, 62.9MB/s]
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 9.66MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.24MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1

In [19]:
#One Example
zero_pipeline(
    df_test.iloc[0]['tweet'],
    candidate_labels=["left", "right"],
)

{'sequence': 'Qué petición tan simple la que hace la cantante: que haya una figura en los teatros que proteja del acoso.',
 'labels': ['left', 'right'],
 'scores': [0.5974672436714172, 0.40253278613090515]}

In [20]:
#One Example MultiClass
zero_pipeline(
    df_test.iloc[0]['tweet'],
    candidate_labels=['left', 'moderate_left', 'moderate_right', 'right'],
)

{'sequence': 'Qué petición tan simple la que hace la cantante: que haya una figura en los teatros que proteja del acoso.',
 'labels': ['left', 'right', 'moderate_left', 'moderate_right'],
 'scores': [0.4279981553554535,
  0.2883560359477997,
  0.14735886454582214,
  0.13628685474395752]}

In [21]:
#Accuracy in Binary Clasification
good = 0
bad = 0
for i in range(len(df_train)):
  _dic = zero_pipeline(
      df_train.iloc[i]['tweet'],
      candidate_labels=["left", "right"],
  )
  scores = _dic['scores']
  labels = _dic['labels']
  max_value = max(scores)
  index = scores.index(max_value)
  prediction = labels[index]
  if prediction == df_train.iloc[i]['ideology_binary']:
    good = good + 1
  else:
    bad = bad + 1

print(f'Aciertos : {good}')
print(f'Desaciertos : {bad}')
print(f'Porciento : {good/(good + bad)}')

KeyboardInterrupt: 

In [None]:
#Accuracy in Multi-Clasification
good = 0
bad = 0
for i in range(len(df_train)):
  _dic = zero_pipeline(
      df_train.iloc[i]['tweet'],
      candidate_labels=['left', 'moderate_left', 'moderate_right', 'right'],
  )
  scores = _dic['scores']
  labels = _dic['labels']
  max_value = max(scores)
  index = scores.index(max_value)
  prediction = labels[index]
  if prediction == df_train.iloc[i]['ideology_multiclass']:
    good = good + 1
  else:
    bad = bad + 1

print(f'Aciertos : {good}')
print(f'Desaciertos : {bad}')
print(f'Porciento : {good/(good + bad)}')