### snscrape

- Twitter is also known for being an abundant source of public text data (perhaps even more so than Reddit).
- For this tutorial, we'll look at using the [snscrape scraper](https://github.com/JustAnotherArchivist/snscrape), which allows us to retrieve tweets that contain specific words, phrases, and hashtags.
- In the slides, we talked about how to set up a Twitter App and get API keys.
    - You should add your own keys below and then run the code block to set your keys:



---



---



# **Installing snscrape:**

In [None]:
!pip3 install snscrape

In [1]:
import os

# **Running snscrape from command line:**

In [None]:
#snscrape --jsonl --progress --max-results 100 --since 2022-01-01 twitter-search "chatgpt filter:verified lang:en until:2023-01-02" > tweets.json

# Simple Python code to scrape Twitter using snscrape

In [None]:
# Shell out to the snscrape CLI; the shell redirection (>) writes the command's
# stdout to tweets.json, which is loaded later with pd.read_json(..., lines=True).
# NOTE(review): the exit status returned by os.system is not checked here, so a
# failed scrape is not detected by the notebook — confirm this is acceptable.
os.system('snscrape --jsonl --progress --max-results 100000 --since 2022-01-01 twitter-search "chatgpt filter:verified lang:en until:2023-01-02" > tweets.json')

# **Installing Dependencies**

In [None]:
!pip install -q neattext
!pip install -q textblob==0.17.1

# **Importing Dependencies**

In [None]:
import pandas as pd                           #package for data analysis
import numpy as np                            #package for handling arrays 
import matplotlib.pyplot as plt               #package for data visualizations
import neattext as nt                         #package for text cleaning
import seaborn as sns                         #package for data visualization

# **Reading the Dataset**

In [None]:
# Load the scraped tweets; lines=True makes pandas parse the file as
# JSON Lines (one JSON object per line), matching the --jsonl scrape output.
df=pd.read_json("tweets.json",lines=True)

# **Features of the Dataset**

In [None]:
# Inspect the available column names (shown as the cell output).
df.columns

In [None]:
# Rename the 'renderedContent' column to 'tweet_text' (df is modified in place).
df.rename(columns={'renderedContent': 'tweet_text'}, inplace=True)

# **Data Preprocessing**

In [None]:
def text_preprocessing(text):
  '''
  Clean a single tweet with the neattext helpers and return the result.

  The helpers are applied in a fixed order; each one receives the output
  of the previous one.
  '''
  cleaning_steps = (
    nt.fix_contractions,        # I'm -> I am
    nt.remove_urls,             # removing urls
    nt.remove_non_ascii,        # removing non-ascii characters
    nt.remove_userhandles,      # removing userhandles
    nt.remove_hashtags,         # removing hashtags
    nt.remove_multiple_spaces,  # removing multiple spaces
  )
  cleaned = text
  for step in cleaning_steps:
    cleaned = step(cleaned)
  return cleaned
  

In [None]:
# Clean every tweet; the function is passed directly, no lambda wrapper needed.
df['tweet_text'] = df['tweet_text'].apply(text_preprocessing)

# **Sentiment Analysis**

In [None]:
from textblob import TextBlob   #special package for short sentence sentiment analysis

In [None]:
def sentiment_polarity(text):
  '''
  Return the TextBlob sentiment polarity of a single tweet.
  '''
  return TextBlob(text).sentiment.polarity

In [None]:
# Score every cleaned tweet and store the polarity in its own column.
df['sentiment_polarity'] = df['tweet_text'].apply(sentiment_polarity)

In [None]:
def sentiment_tag(polarity):
  '''
  Map a polarity score to a sentiment label.

  Returns 'positive' for scores above zero, 'negative' for scores below
  zero and 'neutral' for everything else.
  '''
  if polarity > 0:
    return 'positive'
  if polarity < 0:
    return 'negative'
  return 'neutral'

In [None]:
# Turn each polarity score into its sentiment label.
df['sentiment_tag'] = df['sentiment_polarity'].apply(sentiment_tag)

In [None]:
# Number of tweets per sentiment label.
sns.countplot(data=df, x='sentiment_tag')
plt.title('Sentiment Distribution')
plt.show()

# **Word Cloud Visualization**

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud of the cleaned tweet text. The sentiment labels only take three
# distinct values (already summarised by the count plot), so the cloud is built
# from df['tweet_text'] rather than df['sentiment_tag'].
all_words = ' '.join(df['tweet_text'])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

# Render the generated image without axis ticks.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# **Emotion Analysis**

In [None]:
!pip install -q transformers 
from transformers import pipeline
# Build a transformers pipeline for the 'sentiment-analysis' task backed by the
# 'arpanghoshal/EmoRoBERTa' model; the resulting callable is applied to every
# tweet further down.
emotion = pipeline('sentiment-analysis', model='arpanghoshal/EmoRoBERTa')

In [None]:
# Run the emotion pipeline on each tweet individually; each stored result is
# later read as result[0]['label'] by extract_em.
# NOTE(review): calling the pipeline row by row is presumably slow on large
# frames; passing the texts in batches may be faster — confirm.
df['Emotions']=df['tweet_text'].apply(emotion)

In [None]:
def extract_em(em_dict):
  '''
  Return the 'label' value of the first entry in a pipeline result.
  '''
  first_prediction = em_dict[0]
  return first_prediction['label']

In [None]:
# Keep only the predicted label of each pipeline result.
df['Emotions_Label'] = df['Emotions'].apply(extract_em)

In [None]:
# Number of tweets per predicted emotion label, drawn on a 20x10 figure.
plt.figure(figsize=(20, 10))
sns.countplot(data=df, x='Emotions_Label')
plt.title('Emotions Distribution')

# **Postprocessing**

In [None]:
# Further cleaning for the word cloud and topic model, applied in this order:
# special characters, numbers, stopwords, then short words (argument 3).
df['tweet_text'] = (
  df['tweet_text']
  .apply(nt.remove_special_characters)
  .apply(nt.remove_numbers)
  .apply(nt.remove_stopwords)
  .apply(lambda tweet: nt.remove_shortwords(tweet, 3))
)

In [None]:
# Word cloud of the post-processed tweet text.
all_words = ' '.join(df['tweet_text'])
wordcloud = WordCloud(
  width=800,
  height=500,
  random_state=21,
  max_font_size=110,
).generate(all_words)

# Render the generated image without axis ticks.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Re-inspect the column names, which now include the columns added above.
df.columns

# **Tweets by Month**

In [None]:
# Use the 'date' column as the index; the resample() call that follows groups
# rows by this index (presumably tweet timestamps — verify the column dtype).
df.index=df.date

In [None]:
# Count the tweets that fall into each resampling bin of the date index.
# NOTE(review): 'm' is presumably meant as the monthly frequency alias; newer
# pandas releases may warn about or reject the lowercase spelling — confirm
# against the installed pandas version.
tweets_by_month=df.loc[:,'tweet_text'].resample('m').count()

In [None]:
# Plot the per-period tweet counts with the Series' default plot style.
tweets_by_month.plot()

In [None]:
# Bar chart of the first ten entries of the sourceLabel value counts.
plt.figure(figsize=(20, 5))
source_counts = df['sourceLabel'].value_counts()[0:10]
sns.barplot(x=source_counts.index, y=source_counts.values)
plt.title('Tweet Sources')

In [None]:
# Join all tweets into one string. A space separator is required: with an
# empty separator the last word of each tweet is fused to the first word of
# the next one, producing bogus tokens when the text is split afterwards.
corpus = ' '.join(df['tweet_text'])

In [None]:
# Tokenise the joined text on whitespace; str.split() already returns a list.
corpus = corpus.split()

In [None]:
import gensim.corpora as corpora

# Tokenise each tweet separately so that every tweet is its own document.
# Wrapping one flat token list as [corpus] hands the topic model a single
# document, from which it cannot learn distinct topics. Tweets left empty
# by the cleaning steps are skipped.
texts = [tokens for tokens in (tweet.split() for tweet in df['tweet_text']) if tokens]
# Create Dictionary (token -> integer id) from the per-tweet token lists
id2word = corpora.Dictionary(texts)
# Term Document Frequency: one bag-of-words vector per tweet
corpus = [id2word.doc2bow(text) for text in texts]


# **Topic Modelling**

In [None]:
import gensim

In [None]:
# number of topics
num_topics = 10
# Build LDA model. Pass the num_topics setting through instead of a separate
# hard-coded value, so the model really has the configured number of topics.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)


In [None]:
# Show the keywords of the model's topics. print_topics() is called with its
# default arguments, so it may list only a subset of the topics when the model
# has many — see the gensim documentation for the defaults.
lda_model.print_topics()



---



---

