# Preliminary Import

In [2]:
!pip install snscrape

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting snscrape
  Downloading snscrape-0.6.2.20230320-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.8/71.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: snscrape
Successfully installed snscrape-0.6.2.20230320


In [3]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

# CryptoPunks & Gender & Skin Tone

In [None]:
# Creating list to append tweet data to
tweets_list = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('cryptopunk AND ((female) OR (male) OR(dark) OR (light) OR (medium) OR (albino) OR (alien) OR (ape) OR (zombie)) since:2017-06-23 until:2022-10-31 lang:en min_faves:0 exclude:retweets').get_items()):
  tweets_list.append([tweet.date, tweet.content, tweet.id, tweet.user.username, tweet.hashtags]) #tweet.id

  tweets_list.append([tweet.date, tweet.content, tweet.id, tweet.user.username, tweet.hashtags]) #tweet.id


In [None]:
tweets_list = pd.DataFrame(tweets_list, columns=['Date', 'Text', 'ID', 'Username', 'Hashtags'])

In [None]:
tweets_list.head()

Unnamed: 0,Date,Text,ID,Username,Hashtags
0,2022-10-30 13:45:29+00:00,Gm Gm\n\nCan’t afford an cryptopunk ape? Me ne...,1586715917534494720,HTKYevin,
1,2022-10-30 04:19:42+00:00,Rare Ape CryptoPunk Sells For Almost $4.5 Mill...,1586573536931377152,IsletCrypto,
2,2022-10-29 19:30:06+00:00,The Bored Ape Yacht Club and CryptoPunk floor ...,1586440257989476352,YugaLabsNews_LT,
3,2022-10-29 19:30:06+00:00,The Bored Ape Yacht Club and CryptoPunk floor ...,1586440257070841857,LuckyNFTNews,
4,2022-10-29 17:12:13+00:00,Public sale is Live! @cybotz_nft🚀\n\nTop 10 #N...,1586405557497462786,NftsRankingBot,[NFT]


In [None]:
tweets_list.to_csv("tweets_cryptopunk_gender_skintone_new.csv")

# Import Data

In [4]:
# Load the CSV file into a DataFrame
df = pd.read_csv('tweets_cryptopunk_gender_skintone.csv')

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Download stopwords and stemmer
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Define a function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if not word in stop_words]
    
    # Stem words
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back into string
    preprocessed_text = " ".join(stemmed_tokens)
    
    return preprocessed_text

# Apply the preprocessing function to the 'Text' column
df['Preprocessed Text'] = df['Text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Date,Text,ID,Username,Hashtags,Preprocessed Text
5878,5878,2018-01-15 16:28:04+00:00,NEO and STRAT are my portfolio's light in the ...,952940455628627968,C_Cryptopunk,,neo strat portfolio light dark rn lol
5879,5879,2018-01-03 13:54:31+00:00,More big 2018 #CryptoPunk action: New bid of 1...,948553162058854402,cryptopunksnfts,['CryptoPunk'],big cryptopunk action new bid eth k usd alien
5880,5880,2018-01-01 07:53:01+00:00,@officialmcafee And like you I have a dark pas...,947737412771315713,CryptoPunk_X,,officialmcafe like dark past fight better futur
5881,5881,2017-09-21 16:35:40+00:00,#paintapunk progress! #alien INCOMING!!! #ETH ...,910905378946371585,cryptopunkart,"['paintapunk', 'alien', 'ETH', 'cryptopunk', '...",paintapunk progress alien incom eth cryptopunk...
5882,5882,2017-06-29 22:40:42+00:00,A new #CryptoPunk price record has just been s...,880556660447518720,cryptopunksnfts,['CryptoPunk'],new cryptopunk price record set zombi sold eth


In [7]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv('cleaned_tweets_gender_skintone_new.csv', index=False)

In [8]:
# Define a dictionary of keywords and corresponding frequencies
keyword_frequencies = {'female': 0, 'male': 0, 'dark': 0, 'light': 0, 'medium': 0, 'albino': 0, 'alien': 0, 'ape': 0, 'zombie': 0}

# Loop through each keyword and count the frequency of tweets containing that keyword
for keyword in keyword_frequencies.keys():
    keyword_frequencies[keyword] = df['Preprocessed Text'].str.count(keyword).sum()

# Show the resulting frequencies for each keyword
print(keyword_frequencies)


{'female': 5, 'male': 131, 'dark': 131, 'light': 102, 'medium': 33, 'albino': 42, 'alien': 1510, 'ape': 4561, 'zombie': 14}


In [9]:
import plotly.express as px

# Create a dataframe from the keyword_frequencies dictionary
df_freq = pd.DataFrame.from_dict(keyword_frequencies, orient='index', columns=['Frequency'])

# Reset the index to make 'Keywords' a column
df_freq = df_freq.reset_index().rename(columns={'index': 'Keywords'})

# Create the histogram
fig = px.histogram(df_freq, x='Keywords', y='Frequency', nbins=len(keyword_frequencies), color='Keywords',
                   labels={'Keywords': 'Keywords', 'Frequency': 'Frequency'})

# Customize the layout
fig.update_layout(title='Word Frequency of Gender and Skin Tone Keywords in Tweets',
                  xaxis_title='Keywords',
                  yaxis_title='Frequency',
                  bargap=0.1)

# Add the frequency number on the histogram bars
fig.update_traces(texttemplate='%{y}', textposition='outside')

# Change the color scale
fig.update_layout(coloraxis=dict(colorscale='Viridis'))

# Show the plot
fig.show()


# Sentiment Analysis (VADER)

In [10]:
! pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [11]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Define a dictionary of keywords and corresponding sentiment scores
keyword_scores = {'female': 0, 'male': 0, 'dark': 0, 'light': 0, 'medium': 0, 'albino': 0, 'alien': 0, 'ape': 0, 'zombie': 0}

# Create an instance of the SentimentIntensityAnalyzer class
analyzer = SentimentIntensityAnalyzer()

# Define a function to apply to each tweet containing any of the keywords
def get_sentiment_score(text):
    sentiment_scores = analyzer.polarity_scores(text)
    return sentiment_scores['compound']

# Loop through each keyword and calculate the sentiment score for tweets containing that keyword
for keyword in keyword_scores.keys():
    keyword_df = df[df['Preprocessed Text'].str.contains(keyword)]
    if len(keyword_df) > 0:
        keyword_scores[keyword] = keyword_df['Preprocessed Text'].apply(get_sentiment_score).mean()

# Show the resulting sentiment scores for each keyword
print(keyword_scores)


{'female': 0.12356, 'male': 0.1204487804878049, 'dark': 0.1141646551724138, 'light': 0.18083099999999994, 'medium': 0.208228125, 'albino': 0.0901025, 'alien': 0.07282419475655431, 'ape': 0.08100832859579307, 'zombie': 0.28827692307692304}


In [12]:
import plotly.express as px

# Create a dataframe from the keyword_frequencies dictionary
df_freq = pd.DataFrame.from_dict(keyword_scores, orient='index', columns=['scores'])

# Reset the index to make 'Keywords' a column
df_freq = df_freq.reset_index().rename(columns={'index': 'Keywords'})

# Create the histogram
fig = px.histogram(df_freq, x='Keywords', y='scores', nbins=len(keyword_frequencies), color='Keywords',
                   labels={'Keywords': 'Keywords', 'Scores': 'scores'})

# Customize the layout
fig.update_layout(title='Sentiment Scores for Gender and Skin Tone Keywords',
                  xaxis_title='Keywords',
                  yaxis_title='Sentiment Score',
                  bargap=0.1)

# Add the frequency number on the histogram bars
fig.update_traces(texttemplate='%{y:.3f}', textposition='outside')

# Change the color scale
fig.update_layout(coloraxis=dict(colorscale='Viridis'))

# Show the plot
fig.show()
