# Translate CrowdTangle Hindi columns to English

## Installs and imports

In [None]:
# Environment setup: upgrade pip, then install the third-party packages used below.
# `datasets` and `transformers` are imported in the next cell; `sentencepiece` is
# not imported directly — presumably it is required by the Marian tokenizer (confirm).
!pip install --upgrade pip
!pip install sentencepiece
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.2.2-py3-none-any.whl (2.0 MB)
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.0 MB 24.8 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.3/1.3 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully ins

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import MarianTokenizer, MarianMTModel
from sklearn.metrics import classification_report
from collections import defaultdict
from matplotlib import pyplot as plt
from datasets import load_dataset
import torch
import numpy as np
import urllib
import pandas as pd

In [None]:
# Prefer the GPU whenever torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Length cap handed to the tokenizer (used together with truncation) for every input.
max_length = 512

## Load the `CrowdTangle` dataset

In [None]:
# Mount Google Drive under /content/drive so that the dataset CSV (read in the
# next cell from a path below this mount point) is reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the CrowdTangle export: semicolon-separated fields, '\n'-terminated rows.
# `on_bad_lines='warn'` is the supported replacement for the deprecated
# `error_bad_lines=False` (removed in pandas 2.0): malformed rows are dropped
# and reported with a warning instead of aborting the whole load.
df = pd.read_csv(
    '/content/drive/My Drive/The-London-Story/CrowdTangle-TLS-workspace/2022-07-11-16-34-58-CEST-Historical-Report-2016-12-31--2022-07-11.csv',
    low_memory=False,
    lineterminator='\n',
    sep=';',
    on_bad_lines='warn')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Display the (rows, columns) dimensions of the loaded dataframe.
df.shape

(604703, 40)

Because the full dataset is too large to translate in one run, keep only the first N rows of the dataframe.

In [None]:
# Keep only the leading 500 rows for the rest of the notebook.
df = df.iloc[:500]

In [None]:
# List the column names available in the dataframe.
df.columns

Index(['account.name', 'account.handle', 'platformId', 'Page Category',
       'Page Admin Top Country', 'Page Description', 'Page Created',
       'subscriberCount', 'Followers at Posting', 'date', 'Post Created Date',
       'Post Created Time', 'type', 'totalInteraction',
       'statistics.actual.likeCount', 'statistics.actual.commentCount',
       'statistics.actual.shareCount', 'statistics.actual.loveCount',
       'statistics.actual.wowCount', 'statistics.actual.hahaCount',
       'statistics.actual.sadCount', 'statistics.actual.angryCount',
       'statistics.actual.careCount', 'Video Share Status', 'Is Video Owner?',
       'statistics.actual.videoPostViewCount',
       'statistics.actual.videoTotalViewCount',
       'statistics.actual.videoAllCrosspostsViewCount', 'Video Length',
       'postUrl', 'message', 'expandedLinks.original',
       'expandedLinks.expanded', 'imageText', 'title', 'description',
       'brandedContentSponsor.platformId', 'brandedContentSponsor.name',
 

## Data preprocessing

Replace NaN values in the text columns (`message`, `description`, `imageText`) with empty strings.

In [None]:
# Missing text in the three free-text columns becomes an empty string.
for text_column in ('message', 'description', 'imageText'):
    df[text_column] = df[text_column].fillna('')

Extract the columns needed for translation and convert them to lists.

In [None]:
# Pull each column of interest out of the dataframe as a plain Python list.
(
    accountname_list,
    message_list,
    description_list,
    imageText_list,
    expandedLinks_list,
) = (
    df[column].tolist()
    for column in (
        'account.name',
        'message',
        'description',
        'imageText',
        'expandedLinks.original',
    )
)

print ("Total number of entries in the dataset:", len(message_list))

Total number of entries in the dataset: 500


## Load the translation model and the tokenizer


Load the translation model

In [None]:
# Language pair: translate from Hindi (source) into English (target).
src, trg = "hi", "en"

# Derive the checkpoint name from the language pair, load the pretrained
# translation model and place it on the selected device in one step.
tr_model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
tr_model = MarianMTModel.from_pretrained(tr_model_name).to(device)

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/304M [00:00<?, ?B/s]

Load the tokenizer

In [None]:
# Load the tokenizer that belongs to the translation model: it is fetched from
# the same checkpoint name (`tr_model_name`) that the model was loaded from.
tr_tokenizer = MarianTokenizer.from_pretrained(tr_model_name)

Downloading:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/813k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]



Define a function to translate the text

In [None]:
def translate_hi_to_en (TEXT):
  """Translate text with the loaded translation model and return the result.

  Args:
    TEXT: the text to translate. A single string is the expected input; the
      value is passed to `tr_tokenizer` unchanged.

  Returns:
    The decoded output for the first sequence of the batch, with special
    tokens removed. An empty or whitespace-only string returns '' without
    invoking the tokenizer or the model.
  """
  # With nothing to translate the model still generates a sentence (empty
  # fields came back as "But what about you?"), so short-circuit blank input.
  if isinstance(TEXT, str) and not TEXT.strip():
    return ''

  # Tokenize (truncated to `max_length`), then move the tensors to the model's device.
  encoded = tr_tokenizer(TEXT, truncation=True, padding=True, max_length=max_length, return_tensors="pt").to(device)
  generated = tr_model.generate(**encoded)
  return tr_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

## Translate the dataset

In [None]:
# Pre-allocate the results frame: one row per input entry, holding the account
# name, the three original text fields, their translations, and the link.
output_df = pd.DataFrame(
    index=range(len(accountname_list)),
    columns=[
        'accountname',
        'message',
        'description',
        'imageText',
        'message_translation',
        'description_translation',
        'imageText_translation',
        'link',
    ],
)

 **Translate the dataset**

In [None]:
N = len(message_list)

# Copy the original fields of every entry into `output_df` and add the English
# translation of each text field next to them.
# Cells are written with `.at[row, column]` so the assignment goes to
# `output_df` itself; chained `output_df.loc[i].column = value` assigns through
# an intermediate row object and is not guaranteed to update the frame.
for i in range (N):

  # Progress report every 100 entries.
  if i % 100 == 0:
    print('Example', i+1,' of ', N)

  output_df.at[i, 'accountname'] = accountname_list [i]
  output_df.at[i, 'message'] = message_list [i]
  output_df.at[i, 'description'] = description_list [i]
  output_df.at[i, 'imageText'] = imageText_list [i]
  output_df.at[i, 'message_translation'] = translate_hi_to_en (message_list [i])
  output_df.at[i, 'description_translation'] = translate_hi_to_en (description_list [i])
  output_df.at[i, 'imageText_translation'] = translate_hi_to_en (imageText_list [i])
  output_df.at[i, 'link'] = expandedLinks_list [i]

Example 1  of  500
Example 101  of  500
Example 201  of  500
Example 301  of  500
Example 401  of  500


In [None]:
# Show the last rows of the output dataframe as a spot check of the translations.
output_df.tail()

Unnamed: 0,accountname,message,description,imageText,message_translation,description_translation,imageText_translation,link
495,Sudarshan News,‡§Æ‡•Å‡§Ç‡§¨‡§à: ‡§Ü‡§∞‡•á ‡§Æ‡•á‡§ü‡•ç‡§∞‡•ã ‡§ï‡§æ‡§∞ ‡§∂‡•á‡§° ‡§ï‡•á ‡§ñ‡§ø‡§≤‡§æ‡§´ ‡§™‡•ç‡§∞‡§¶‡§∞‡•ç‡§∂‡§® ‡§Æ‡•á...,‡§∂‡§ø‡§µ‡§∏‡•á‡§®‡§æ ‡§®‡•á‡§§‡§æ ‡§Ü‡§¶‡§ø‡§§‡•ç‡§Ø ‡§†‡§æ‡§ï‡§∞‡•á ‡§®‡•á ‡§ï‡§π‡§æ ‡§ï‡§ø ‡§Ü‡§∞‡•á ‡§ï‡•Ä ‡§≤‡§°‡§º...,,Mumbai: Big thing was #Shinen dut #Auxane lead...,Shivina leader Mutharre said that Arne's war i...,But what about you?,https://www.sudarshannews.in/shivsena-leader-a...
496,Jalamsingh surana,,,‡§ú‡•à‡§∏‡§≤‡§Æ‡•á‡§∞ ‡§ú‡•à‡§∏‡§≤‡§Æ‡•á‡§∞-‡§™‡•ã‡§ï‡§∞‡§£ ‡§≠‡§æ‡§∏‡•ç‡§ï‡§∞ 07-07-2022 ‡§ú‡•à‡§∏‡§≤‡§Æ‡•á...,But what about you?,But what about you?,"Muller Tsermer-Permor, a 07-202 Joller-Permmer...",https://www.facebook.com/photo.php?fbid=403786...
497,Love jihad ‡§≤‡§µ ‡§ú‡§ø‡§π‡§æ‡§¶,,,"‡§ú‡§Æ‡§æ‡§®‡§æ ‡§¨‡§¶‡§≤ ‡§ó‡§Ø‡§æ ‡§π‡•à ‡§Ø‡§æ‡§∞‡•ã‡§Ç, ‡§Ö‡§¨ ‡§ó‡•ç‡§∞‡§æ‡§π‡§ï ‡§≠‡§ó‡§µ‡§æ‡§® ‡§®‡§π‡•Ä‡§Ç, ...",But what about you?,But what about you?,"Guys, men have changed, no longer customer god...",https://www.facebook.com/photo.php?fbid=432360...
498,Updesh rana youth brigade,,‡§Æ‡•à‡§Ç ‡§∂‡§Ç‡§≠‡•Ç ‡§®‡§æ‡§• ‡§∞‡•á‡§ó‡§∞ ‡§ï‡•á ‡§ï‡§æ‡§∞‡•ç‡§Ø ‡§∏‡•á ‡§™‡•ç‡§∞‡§∏‡§®‡•ç‡§® ‡§π‡•Ç‡§Ç ‡§Æ‡•á‡§∞‡§æ...,,But what about you?,"I'm pleased with the work of the N√∂hrer, my ro...",But what about you?,https://www.facebook.com/100005952257352/posts...
499,Kapil Mishra Fans,‡§∏‡§ö‡•ç‡§ö‡§æ‡§à ‡§ï‡§≠‡•Ä ‡§®‡§π‡•Ä ‡§õ‡•Å‡§™ ‡§∏‡§ï‡§§‡•Ä üëáüëáüëá,,Tweet Panchjanya ‡§™‡§æ‡§ú‡•ç‡§µ‡§ú‡§®‡•ç‡§Ø @epanchjanya ‡§¨‡§æ‡§ü‡§≤‡§æ ...,The Truth Can Never Hide,But what about you?,Tweet Panchjanya circum3a @ecudavany Povlller ...,https://www.facebook.com/photo.php?fbid=354206...


Save the translated dataset

In [None]:
# Write the originals and their translations to a CSV on the mounted Drive;
# `index=False` leaves the dataframe's row index out of the file.
output_df.to_csv('/content/drive/MyDrive/translated_CT.csv', index=False)