In [None]:
# Téléchagement

! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 6.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 21.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 41.8MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


# Préparation des données

In [None]:
# Load the data
import numpy as np
import pandas as pd

# Raw CSV of tweets hosted on GitHub.
source_url = 'https://raw.githubusercontent.com/MARCTELLY/NLP-TER/main/clean_donaldTrump_tweets%20-%20clean_donaldTrump_tweets.csv'
df0 = pd.read_csv(source_url)

# Keep only the rows whose `is_retweet` field is the string "FALSE".
not_retweet = df0['is_retweet'] == "FALSE"
df0 = df0.loc[not_retweet]

# Dump the remaining tweet texts to disk.
df0['text'].to_csv('twetts.txt')

In [None]:
# Build the list of tweet texts from the filtered DataFrame.
# `np.float` was only an alias of the builtin `float` and has been removed
# from recent NumPy releases, so the builtin is used directly.
text_list = df0[['text']].select_dtypes(exclude=[float]).dropna(axis=0)
text = list(text_list.text)

In [None]:
# Preview the first five tweets of the list.
text[:5]

['71000000 Legal Votes. The most EVER for a sitting President!',
 'THE OBSERVERS WERE NOT ALLOWED INTO THE COUNTING ROOMS. I WON THE ELECTION GOT 71000000 LEGAL VOTES. BAD THINGS HAPPENED WHICH OUR OBSERVERS WERE NOT ALLOWED TO SEE. NEVER HAPPENED BEFORE. MILLIONS OF MAIL-IN BALLOTS WERE SENT TO PEOPLE WHO NEVER ASKED FOR THEM!',
 'I WON THIS ELECTION BY A LOT!',
 'Georgia Counties Using Same Software as Michigan Counties Also Encounter ‘Glitch’  via @BreitbartNews What a total mess this “election” has been!',
 'Lawyer’s Press Conference at Four Season’s Landscaping Philadelphia. Enjoy!']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the tweets for evaluation; the fixed seed makes the split
# reproducible across kernel restarts.
tweet_train, tweet_test = train_test_split(text, test_size=.1, random_state=42)

# Write one tweet per line. Writing `str(list)` would store the Python list
# repr (brackets, quotes, commas, backslash escapes) and the language model
# would learn to reproduce those artefacts.
# UTF-8 is forced because the tweets contain non-ASCII characters.
with open("tweet_test.txt", 'w', encoding='utf-8') as f:
  f.write("\n".join(str(tweet) for tweet in tweet_test))  # was `tweet_text` (NameError)


with open("tweet_train.txt", 'w', encoding='utf-8') as f:
  f.write("\n".join(str(tweet) for tweet in tweet_train))

# Modèle

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Locations of the text files holding the training and evaluation tweets.
train_path, test_path = "tweet_train.txt", "tweet_test.txt"

# Pretrained GPT-2 tokenizer; the end-of-sequence token doubles as the
# padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator used for fine-tuning.

    Args:
        train_path: Path of the text file holding the training tweets.
        test_path: Path of the text file holding the evaluation tweets.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: Value forwarded as `block_size` to each `TextDataset`
            (defaults to 128, the value previously hard-coded).

    Returns:
        A `(train_dataset, test_dataset, data_collator)` tuple.
    """
    def _text_dataset(file_path):
        # Both datasets share the same tokenizer and block size.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size)

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False: the collator is not set up for masked-language modelling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [None]:
from transformers import Trainer, TrainingArguments

# Start from the pretrained GPT-2 weights.
model = GPT2LMHeadModel.from_pretrained("gpt2")


training_args = TrainingArguments(
    output_dir="./gpt2-tweet", # the output directory
    overwrite_output_dir=True, # overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    # Without a step-based evaluation strategy `eval_steps` has no effect and
    # `eval_dataset` is never used during training.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - adjust if the installed version differs from the
    # 4.5.1 release shown in the install output.
    evaluation_strategy="steps",
    eval_steps=400, # number of update steps between two evaluations
    save_steps=800, # a checkpoint is saved every `save_steps` steps
    warmup_steps=500, # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True,
    )


# Wire model, arguments, collator and datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Launch fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Persist the fine-tuned model.
# NOTE(review): presumably written to the Trainer's output_dir
# ("./gpt2-tweet"), which is the path the generation pipeline loads - confirm.
trainer.save_model()

In [None]:
from transformers import pipeline

# Text-generation pipeline backed by the locally saved fine-tuned model and
# the stock GPT-2 tokenizer.
# The `config` argument is meant for a model configuration, not for generation
# options: the `{'max_length': 250}` dict previously passed there did not
# lengthen the recorded outputs. The length limit is now supplied at call time.
tweet_pipeline = pipeline('text-generation', model='./gpt2-tweet', tokenizer='gpt2')


def tweet(prompt, **generate_kwargs):
    """Generate text from `prompt` with the fine-tuned model.

    Args:
        prompt: Text the generation starts from.
        **generate_kwargs: Extra generation options forwarded to the
            pipeline call; `max_length` defaults to 250 when not given.

    Returns:
        Whatever the underlying pipeline call returns.
    """
    generate_kwargs.setdefault('max_length', 250)
    return tweet_pipeline(prompt, **generate_kwargs)




In [None]:
# Generate text from the prompt "barack obama".
tweet("barack obama")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'barack obama is trying to take away our National Anthem", \'Thank you!\', \'Just talked to @BarackObama about his "Biggest Victory" with the African American Community. It is the biggest American achievement yet and it will be'}]

In [None]:
# Generate text from the prompt "Democrates are" (prompt spelling kept as written).
tweet("Democrates are")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Democrates are a disgrace to the Republican Party and the Country', 'A major victory for the @WhiteHouse for the people!', 'Congressman @David_Hagel for his leadership and commitment to @thehill is a hero for our"}]

In [None]:
# Generate text from the prompt "Thank to our".
tweet("Thank to our")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Thank to our beautiful @GOPleaders…', 'RT @TomFitton: “Senator Mark Warner did an amazing job on Fox &amp; Friends last night. Great job on your new book. You are the man for it but you still"}]

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import pipeline

# Text-generation pipeline backed by the model stored on the mounted Drive and
# the stock GPT-2 tokenizer.
# The `config` argument is meant for a model configuration, not for generation
# options, so the `{'max_length': 128}` dict previously passed there is
# replaced by a `max_length` default supplied at call time.
tweet_pipeline = pipeline('text-generation', model='/content/drive/MyDrive/TER/gpt-tweet', tokenizer='gpt2')


def tweet(prompt, **generate_kwargs):
    """Generate text from `prompt` with the model loaded from Drive.

    Args:
        prompt: Text the generation starts from.
        **generate_kwargs: Extra generation options forwarded to the
            pipeline call; `max_length` defaults to 128 when not given.

    Returns:
        Whatever the underlying pipeline call returns.
    """
    generate_kwargs.setdefault('max_length', 128)
    return tweet_pipeline(prompt, **generate_kwargs)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




In [None]:
# Generate text from the prompt "Barack Obama and democrats".
tweet("Barack Obama and democrats")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Barack Obama and democrats can’t get their way...we’re at a standstill', 'Congressman Mike Crapo (@CumpoGOP) has been an incredible advocate for America and has my Complete and Total End"}]

In [None]:
# Generate text from the single-word prompt "Joe".
tweet("Joe")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Joe.  My time is going to come when I’m 100% there and you will WIN!\', \'@fantasticlouis: Best conversation ever with Donald Trump!  He\'ll talk about golf!", \'Thank you for'}]

In [None]:
# Generate text from the all-lowercase prompt "sleepy joe".
tweet("sleepy joe")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'sleepy joe - is this what you really think?"  I don\'t have anything else. @realDonaldTrump!", \'If our @FITZ President @realDonaldTrump runs you will win.  Great.\', \'RT @GOPChairwoman: As'}]

In [None]:
# Same prompt as the previous cell, but with "Joe" capitalised.
tweet("sleepy Joe")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'sleepy Joe Biden? He is also a totally stupid person!\', "Donald Trump\'s approval rating in the RealClearPolitics Poll is at a record low - he\'s the only one who can make it.", \'@_T_Leigh @'}]

In [None]:
# Generate text from the upper-case prompt "MAKE".
tweet("MAKE")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'MAKE AMERICA GREAT AGAIN!,11-22-2014 02:59:11,14,false,36535267713672907\\nTwitter for Android,Just finished my great book "How to Make Life More So" by'}]

In [None]:
# Generate text from the prompt "Hillary".
tweet("Hillary")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hillary Clinton was also caught asking how many women are in her party?", \'@jakkarlie:  @realDonaldTrump @the_glennfj  He is my mentor I love him and I know he will make this country great again'}]