# Procedure
1. Data gathering
2. Data assessment and cleaning
3. Data Preprocessing
4. Sentiment Analysis
5. Data Visualization
6. Communications and Insight


Data gathering:

Scrape tweets using Tweepy and Twitter's API

collect:
user id
tweet content
location
likes
retweets

In [None]:
import os
import time

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import tweepy
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

tweepy.__version__

In [None]:
#Setting up authorization to tweepy API
#Requires an account with elevated access.
#Credentials are read from environment variables so real keys never have to be
#typed into (and saved with) the notebook. When a variable is not set, the '*'
#placeholder is used and must be replaced before any request can succeed.
app_api_key = os.environ.get('TWITTER_API_KEY', '*')
app_api_key_secret = os.environ.get('TWITTER_API_KEY_SECRET', '*')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '*')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '*')
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', '*')


auth = tweepy.OAuthHandler(app_api_key, app_api_key_secret)
auth.set_access_token(access_token, access_token_secret)

#wait_on_rate_limit makes tweepy pause until the rate-limit window resets
#instead of raising when the request quota is exhausted
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
def scrape(search_words, date_since, num_tweets):
  """Search English tweets and return the results as a DataFrame.

  Parameters
  ----------
  search_words : str
      Twitter search query.
  date_since : str
      Earliest tweet date as 'YYYY-MM-DD'. A falsy value disables the date filter.
  num_tweets : int
      Maximum number of tweets to fetch.

  Returns
  -------
  pandas.DataFrame
      One row per tweet with the columns username, location, time, followers,
      text, retweets and hashtags. Empty (but with those columns) when the
      search returns nothing.
  """
  columns = ['username','location', 'time', 'followers', 'text', 'retweets', 'hashtags']

  #`since_id` expects a numeric tweet ID, not a date, so the date filter is
  #expressed with the `since:` search operator instead. The original query is
  #parenthesized so the date restriction applies to the whole expression.
  #NOTE(review): confirm grouping with parentheses behaves as expected on the
  #search endpoint in use.
  query = search_words
  if date_since:
    query = '(' + search_words + ') since:' + str(date_since)

  #Scrape twitter for tweets
  tweets = tweepy.Cursor(api.search_tweets, q=query, lang="en", tweet_mode='extended').items(num_tweets)

  #Collect one record per tweet; the frame is built once at the end instead of
  #being grown row by row
  records = []
  for tweet in tweets:
    #Change hashtags into a single string (each tag is prefixed with a space)
    hashtext = ''.join(' ' + tag['text'] for tag in tweet.entities['hashtags'])

    #Retweets carry the full text on the original status; plain tweets do not
    #have `retweeted_status`, hence the AttributeError fallback
    try:
      text = tweet.retweeted_status.full_text
    except AttributeError:
      text = tweet.full_text

    records.append([
      tweet.user.screen_name,
      tweet.user.location,
      tweet.created_at,
      tweet.user.followers_count,
      text,
      tweet.retweet_count,
      hashtext,
    ])

  return pd.DataFrame(records, columns=columns)



In [None]:
date_since = "2022-10-04"
num_tweets = 500
sleep_timer = 240 #4 minute sleep timer after each set of tweet requests, necessary to avoid overloading tweepy and getting kicked out

#General consensus of game
#scrape() already returns a frame with the expected columns, so no empty
#frame / concat step is needed
df_overwatch = scrape("#Overwatch OR #Overwatch2 OR #PlayOverwatch", date_since, num_tweets)
time.sleep(sleep_timer)


#Making a dictionary of dataframes to hold data for each character
char_dict = {}

#Names double as search terms, so they must be spelled the way people tweet them
char_lst = ['Ana', 'Ashe', 'Baptiste', 'Bastion', 'Brigitte', 'Cassidy', 'D.Va', 'Doomfist', 'Echo',
              'Genji', 'Hanzo', 'Junker_Queen', 'Junkrat', 'Lucio', 'Mei', 'Mercy', 'Moira', 'Orisa',
              'Pharah', 'Ramattra', 'Reaper', 'Reinhardt', 'Roadhog', 'Sigma', 'Sojourn', 'Soldier_76',
              'Sombra', 'Symmetra', 'Torbjorn', 'Tracer', 'Widowmaker', 'Winston', 'Wrecking_Ball',
              'Zarya', 'Zenyatta']
for name in char_lst:
  #Multi-word names are stored with underscores (used as dictionary keys and plot
  #labels); search for them as an exact quoted phrase with spaces instead
  if '_' in name:
    search_name = '"' + name.replace('_', ' ') + '"'
  else:
    search_name = name

  #NOTE(review): confirm the search endpoint treats the literal AND as an
  #operator rather than as a keyword
  char_dict[name] = scrape(search_name + " AND Overwatch", date_since, num_tweets)
  time.sleep(sleep_timer)


Data Assessment and Cleaning:

1. Remove hashtags

In [None]:
#Strip the '#' symbol from every tweet body: general frame first, then each character frame
df_overwatch['text'] = df_overwatch['text'].str.replace('#', '', regex=True)

for key, char_df in char_dict.items():
  stripped = char_df['text'].str.replace('#', '', regex=True)
  char_df['text'] = stripped
  print(char_df['text'])

Data preprocessing:

1. Tokenize
2. Remove stop words
3. Lemmatization
4. Remove anything that isn't a noun, adjective, or verb (NAV)

In [None]:
#Tokenizer models used by nltk.word_tokenize
nltk.download('punkt')
#WordNet corpus used by WordNetLemmatizer
nltk.download('wordnet')
#Tagger model used by nltk.pos_tag
nltk.download('averaged_perceptron_tagger')
#Open Multilingual WordNet data; presumably required by the WordNet reader in the installed NLTK version -- TODO confirm
nltk.download('omw-1.4')

In [None]:
def make_nav(df):
  """Add a 'Nav' column holding the POS-filtered, lemmatized text of each row.

  For every value in df['text'] the text is tokenized, each token is
  lemmatized, the result is tokenized again and POS-tagged, and only tokens
  whose tag is in the kept set are retained. Kept tokens are joined with a
  trailing space after each one.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a string 'text' column. Modified in place.

  Returns
  -------
  None
  """
  lemmatizer = WordNetLemmatizer()

  #Penn Treebank tags that survive the filter.
  #NOTE(review): besides nouns, adjectives and verbs this also keeps pronouns
  #and adverbs, and the verb tags VBP / VBZ are not included -- confirm intended.
  keep_tags = {
    'NN', 'NNS', 'NNP', 'NNPS',   #Nouns
    'JJ', 'JJR', 'JJS',           #Adjectives
    'PRP', 'PRP$',                #Pronouns
    'RB', 'RBR', 'RBS',           #Adverbs
    'VB', 'VBG', 'VBD', 'VBN',    #Verbs
  }

  nav_texts = []
  #Iterate over the values directly so the frame's index does not have to be 0..n-1
  for text in df['text']:
    text_tokens = nltk.word_tokenize(text)

    #Lemmatize, then re-tokenize the lemmatized text before tagging
    lemmatized_description = ' '.join(lemmatizer.lemmatize(w) for w in text_tokens)
    tagged = nltk.pos_tag(nltk.word_tokenize(lemmatized_description))

    nav_texts.append(''.join(word + ' ' for word, tag in tagged if tag in keep_tags))

  #Single positional assignment instead of chained df['Nav'][i] = ... writes;
  #explicit object dtype keeps the .str accessor usable even for an empty frame
  df['Nav'] = pd.Series(nav_texts, index=df.index, dtype=object)



In [None]:
#Add the POS-filtered 'Nav' column to the general frame and to every character frame
make_nav(df_overwatch)
for char_df in char_dict.values():
  make_nav(char_df)

More data cleaning:

1. Make lowercase
2. Remove extra symbols
3. Replace dashes

In [None]:
#Make lowercase
df_overwatch['Nav'] = df_overwatch['Nav'].str.lower()

#Remove urls, @ symbols, contraction tails
#regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
#string by default, which would leave the text untouched.
#`https?` removes both http and https; `'\S+` removes an apostrophe together
#with the characters that follow it (the earlier `'S+` could never match
#lowercased text).
df_overwatch['Nav'] = df_overwatch['Nav'].str.replace(r"https?|//\S+|@|'\S+", '', regex=True)

#Replace dashes and underscores with spaces
df_overwatch['Nav'] = df_overwatch['Nav'].str.replace(r'[-_]', ' ', regex=True)


In [None]:
#Same cleaning as for the general frame, applied to every character frame.
#regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
#string by default, which would leave the text untouched.
for key in char_dict:
  nav = char_dict[key]['Nav'].str.lower() #Lowercase
  #remove URLs (http/https and the //... remainder), @, contraction tails
  nav = nav.str.replace(r"https?|//\S+|@|'\S+", '', regex=True)
  char_dict[key]['Nav'] = nav.str.replace(r'[-_]', ' ', regex=True) #replace -, _

Sentiment analysis/opinion mining:

1. Use TextBlob to get sentiment analysis
2. Check to see overall game sentiment
3. Create scale on most liked vs least liked character

In [None]:
#Returns the average polarity and subjectivity of a dataframe column
#Polarity range: -1 to 1
#Subjectivity range: 0 to 1
def get_avg_polar_and_subjec(df_column):
  """Average the TextBlob polarity and subjectivity over a collection of texts.

  Parameters
  ----------
  df_column : sized iterable of str
      Texts to score, e.g. a DataFrame column.

  Returns
  -------
  tuple
      (polarity_average, subjectivity_average). An empty input yields
      (0.0, 0.0) instead of raising ZeroDivisionError, so a search that
      returned no tweets does not abort the analysis.
  """
  count = len(df_column)
  if count == 0:
    return 0.0, 0.0

  polarity_total = 0
  subjectivity_total = 0

  for text in df_column:
    #.sentiment is a (polarity, subjectivity) named tuple
    sentiment = TextBlob(text).sentiment
    polarity_total += sentiment.polarity
    subjectivity_total += sentiment.subjectivity

  return polarity_total / count, subjectivity_total / count

In [None]:
#Average polarity and subjectivity over the cleaned 'Nav' text of the general Overwatch tweets
game_polarity_average, game_subjectivity_average = get_avg_polar_and_subjec(df_overwatch['Nav'])

In [None]:
#Creating a dataframe to hold subjectivity and polarity
#Rows are collected in a list and the frame is built once, rather than growing
#it one row at a time; the numeric columns get their dtype inferred from the data
pol_sub_rows = []

for key in char_dict:
  temp_pol_avg, temp_sub_avg = get_avg_polar_and_subjec(char_dict[key]['Nav'])
  pol_sub_rows.append([key, temp_pol_avg, temp_sub_avg])

df_pol_sub = pd.DataFrame(pol_sub_rows, columns = ['name','polarity', 'subjectivity'])

Data Visualization

In [None]:
#Report the averages computed over the general Overwatch tweets
print(f'Overall Game Polarity: {game_polarity_average}')
print(f'Overall Game Subjectivity: {game_subjectivity_average}')

In [None]:
#Bar graph shows how much each character is liked
labels = df_pol_sub['name']
values = df_pol_sub['polarity']

#One 38x10 figure with a single bar per character
fig, ax = plt.subplots(figsize=(38, 10))
ax.bar(labels, values)
ax.set_xlabel('Name')
ax.set_ylabel('Polarity')
plt.show()

In [None]:
#Top 5 most liked characters
print("Top 5 Most Liked Characters:")
by_polarity = df_pol_sub.sort_values(by=['polarity'])
#The sort is ascending, so the highest polarity is at the end; reverse the
#last five so the most liked character is listed first
liked_char = by_polarity['name'].tail(5).iloc[::-1]
print(liked_char.to_string(index=False))

In [None]:
#Top 5 least liked characters
print("Top 5 Least Liked Characters:")
by_polarity = df_pol_sub.sort_values(by=['polarity'])
#Ascending sort: the first five rows have the lowest polarity
liked_char = by_polarity['name'].head(5)
print(liked_char.to_string(index=False))

In [None]:
#Bar graph shows how subjective the opinion on each character is
labels = df_pol_sub['name']
values = df_pol_sub['subjectivity']

#One 38x10 figure with a single bar per character
fig, ax = plt.subplots(figsize=(38, 10))
ax.bar(labels, values)
ax.set_xlabel('Name')
ax.set_ylabel('Subjectivity')
plt.show()

In [None]:
#Relationship between polarity and subjectivity: one point per character
x = df_pol_sub['polarity']
y = df_pol_sub['subjectivity']

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('Polarity')
ax.set_ylabel('Subjectivity')
plt.show()