# Extract Tweets with Snscrape

In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

In [29]:
# Creating list to append tweet data to
attributes_container = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('mujer since:2021-07-05 until:2022-07-06').get_items()):
    if i > 150:
        break
    attributes_container.append([tweet.user.username, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.content])

In [30]:
# Creating a dataframe to load the list
fema_df = pd.DataFrame(attributes_container,
                         columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"])

In [31]:
fema_df.head()

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet
0,luzjttko,2022-07-05 23:59:59+00:00,0,Twitter Web App,Le pasa que esta echo un progre! Y por ser sum...
1,Azotepigs,2022-07-05 23:59:57+00:00,4,Twitter for Android,@maleva_yina @Vitaadara @carlos_sobera Tambi√©n...
2,nwathaniel,2022-07-05 23:59:57+00:00,0,Twitter for Android,soy solo una mujer con 12 pesta√±as de ao3 abie...
3,LaChicaArdilla2,2022-07-05 23:59:54+00:00,17,Twitter for Android,"""Si un hombre golpea a una mujer esta mal. A m..."
4,Pinocchio123456,2022-07-05 23:59:50+00:00,0,Twitter for iPad,@DaniWolv Qu√© belleza de mujer.


In [32]:
# Creating list to append tweet data to
attributes_container = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('hombre since:2021-07-05 until:2022-07-06').get_items()):
    if i > 150:
        break
    attributes_container.append([tweet.user.username, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.content])

In [33]:
# Creating a dataframe to load the list
male_df = pd.DataFrame(attributes_container,
                         columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"])

In [34]:
male_df.head()

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet
0,AFmedios,2022-07-05 23:59:59+00:00,6,Twitter Web App,#CirculaEnRedes este video de un operativo pol...
1,LaChicaArdilla2,2022-07-05 23:59:54+00:00,17,Twitter for Android,"""Si un hombre golpea a una mujer esta mal. A m..."
2,caraidiente,2022-07-05 23:59:52+00:00,0,Twitter for Android,Vos sos pelotuda. Y si fueses hombre ser√≠as pe...
3,Rubencai99,2022-07-05 23:59:51+00:00,1,Twitter for iPhone,@franeur @carlaysa01 Hombre es que con la resp...
4,Daniellex1012,2022-07-05 23:59:50+00:00,4,Twitter for Android,@atusaludenlinea Hombre Bicentenario\nYo antes...


In [35]:
print(fema_df.shape)
print(male_df.shape)

(151, 5)
(151, 5)


In [36]:
fema_df["gender"] = "female"
male_df["gender"] = "male"

In [37]:
male_df.head()

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet,gender
0,AFmedios,2022-07-05 23:59:59+00:00,6,Twitter Web App,#CirculaEnRedes este video de un operativo pol...,male
1,LaChicaArdilla2,2022-07-05 23:59:54+00:00,17,Twitter for Android,"""Si un hombre golpea a una mujer esta mal. A m...",male
2,caraidiente,2022-07-05 23:59:52+00:00,0,Twitter for Android,Vos sos pelotuda. Y si fueses hombre ser√≠as pe...,male
3,Rubencai99,2022-07-05 23:59:51+00:00,1,Twitter for iPhone,@franeur @carlaysa01 Hombre es que con la resp...,male
4,Daniellex1012,2022-07-05 23:59:50+00:00,4,Twitter for Android,@atusaludenlinea Hombre Bicentenario\nYo antes...,male


In [38]:
all_df = pd.concat([fema_df, male_df])
print(all_df.shape)

(302, 6)


## Clean Tweets

In [40]:
import re
import string
import nltk
from nltk .corpus import stopwords
from nltk import word_tokenize
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marcos.olguin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
def clean_tweet(text):
    stopWords = set(stopwords.words('spanish'))
    text = str(text).lower()
    text = re.sub(r'@[A-Za-z0-9]+', ' ', text)  # remover @
    text = re.sub(r'RT[|\s]', ' ', text)        # remove RTs
    text = re.sub(r'#', ' ', text)              # remove # into the tweet
    text = re.sub(r'https?:\/\/\S+', ' ', text) # remove links

    pattern = r'''(?x)                  # set flag to allow verbose regexps
                (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A
                | \w+(?:-\w+)*          # Words with optional internal hyphens
                | \$?\d+(?:\.\d+)?%?    # Currency and precentages, e.g. $12.40 82%
                | \.\.\.                # Ellipsis
                | [][.,;"'?():-_`]      # These are separate tokens; includes ],[
                '''
    words = nltk.regexp_tokenize(text, pattern)
    re_punc = re.compile('[%s]' % re.escape(string.punctuation)) # Remover signos de puntuacion
    stripped = [re_punc.sub('', w) for w in words]
    #stripped = re.sub(' +', ' ', stripped) # remove multiple spaces
    no_garbage = [w for w in stripped if  w.lower() not in stopWords] # remove stopwords
    no_multiple_spaces = [w for w in no_garbage if  w.lower() not in ' '] # remover multiple spaces
    
    return (" ".join(no_multiple_spaces))

In [43]:
# Del listado de tweets le aplicamos la funcion transform para que nos quede un texto mas limpio
all_df['text_clean'] = all_df['Tweet'].apply(clean_tweet)

In [44]:
all_df.head()

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet,gender,text_clean
0,luzjttko,2022-07-05 23:59:59+00:00,0,Twitter Web App,Le pasa que esta echo un progre! Y por ser sum...,female,pasa echo progre ser sumiso mujer psiqui√°trica...
1,Azotepigs,2022-07-05 23:59:57+00:00,4,Twitter for Android,@maleva_yina @Vitaadara @carlos_sobera Tambi√©n...,female,yina sobera real sacado video xenofobo hom√≥fob...
2,nwathaniel,2022-07-05 23:59:57+00:00,0,Twitter for Android,soy solo una mujer con 12 pesta√±as de ao3 abie...,female,solo mujer 12 pesta√±as ao3 abiertas
3,LaChicaArdilla2,2022-07-05 23:59:54+00:00,17,Twitter for Android,"""Si un hombre golpea a una mujer esta mal. A m...",female,si hombre golpea mujer mal menos hombre jud√≠o ...
4,Pinocchio123456,2022-07-05 23:59:50+00:00,0,Twitter for iPad,@DaniWolv Qu√© belleza de mujer.,female,belleza mujer


In [45]:
all_df.to_csv("example_tweet_df.txt", sep="|")

## Another example

In [51]:
# Creating list to append tweet data to
attributes_container = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('super bowl OR marcos -filter:retweets lang:es').get_items()):
    if i > 20:
        break
    attributes_container.append([tweet.user.username, tweet.date, tweet.coordinates, tweet.likeCount, tweet.sourceLabel, tweet.content])

In [58]:
# SI FUNCIONA
# Creating list to append tweet data to
attributes_container = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:marcosomtz OR from:nfl lang:es').get_items()):
    if i > 20:
        break
    attributes_container.append([tweet.user.username, tweet.date, tweet.coordinates, tweet.likeCount, tweet.sourceLabel, tweet.content])

In [60]:
# SI FUNCIONA
# Creating list to append tweet data to
attributes_container = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('marcos OR leonardo lang:es').get_items()):
    if i > 20:
        break
    attributes_container.append([tweet.user.username, tweet.date, tweet.coordinates, tweet.likeCount, tweet.sourceLabel, tweet.content])

In [61]:
# Creating a dataframe to load the list
tweets_df = pd.DataFrame(attributes_container,
                         columns=["User", "Date Created", "coord", "Number of Likes", "Source of Tweet", "Tweet"])

tweets_df

Unnamed: 0,User,Date Created,coord,Number of Likes,Source of Tweet,Tweet
0,alwaysbadidea,2022-10-21 02:26:00+00:00,,0,Twitter for Android,me pasan el Instagram de Marcos? hoy por m√≠ y ...
1,Marcosxp2020,2022-10-21 02:25:58+00:00,,0,Twitter for Android,@Magherito ü§£üòÇü§£üòÇ Diablos 6 empanadas ü§£üòÇü§£üòÇü§£üòÇü§£üòÇü§£üëç...
2,marcosgood01,2022-10-21 02:25:57+00:00,,0,Twitter for iPhone,mamaste ya encontr√© en insta
3,muleygervasi,2022-10-21 02:25:57+00:00,,0,Twitter Web App,por que el twitter me sale en algun idioma esl...
4,EberDeLaLuz,2022-10-21 02:25:53+00:00,,0,Twitter for Android,"Casi todos votan a Marcos, pero nadie lo dice,..."
5,PibeLeguizamon,2022-10-21 02:25:53+00:00,,0,Twitter for Android,Querido diario: hoy en #GranHernano22 la HOMOF...
6,_goldenbones,2022-10-21 02:25:50+00:00,,0,Twitter Web App,Gente protejamos a Marcos y no lo saquen #GH2022
7,movimientoe_lp,2022-10-21 02:25:48+00:00,,0,Twitter for Android,Con la presencia de nuestro Diputado provincia...
8,QRRLeonardo,2022-10-21 02:25:47+00:00,,0,Twitter for Android,@DeltaMdelta @OAS_official @Politica_LR Lo pri...
9,leonardo_osorto,2022-10-21 02:25:41+00:00,,0,Twitter for Android,Mi excusa para no salir es que no tengo dinero...


In [5]:
tweets_df.Tweet[0]

"@misinning I like that my mind went to 'more excuses for sex while I legitimately struggle with my grades' and you reached the more sensible, Shimazaki-appropriate conclusion of 'simply give me a good grade AND sex'"