In [1]:
# General library imports, including the visualization stack (matplotlib and seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Uncomment these lines if the packages below are not installed yet
#!(yes |pip install geopandas)
#!(yes |pip install descartes)
#!(yes |conda install -c conda-forge geoplot)
#!(yes | pip install plotly)
import plotly.express as px

%config IPCompleter.greedy=True
%matplotlib inline

plt.style.use('default') # make the matplotlib plots look a bit nicer
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # set the grid type used by seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suppress scientific notation in the outputs

# NOTE(review): ignoring every warning also hides the deprecation and
# chained-assignment warnings that later cells of this notebook can raise.
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Getting the data for location-keyword relationship analysis
twitterCleanData = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv')
#Filtering those values that are missing (stored as the literal text 'unknown')
knownKeywordAndLocation = (twitterCleanData['keyword'] != 'unknown') & (twitterCleanData['location'] != 'unknown')
#The explicit copy makes this an independent frame: later cells assign new columns
#to it, which must not be attempted on a slice of twitterCleanData
twitterKeywordAndLocation = twitterCleanData.loc[knownKeywordAndLocation, ['keyword', 'location']].copy()
twitterKeywordAndLocation.head()

In [None]:
#Some of DataFrame's properties
twitterKeywordAndLocation.info()

In [None]:
#Changing DataFrame columns data types in order to apply some operations on them 

twitterKeywordAndLocation['keyword'] = twitterKeywordAndLocation['keyword'].astype('string')
twitterKeywordAndLocation['location'] = twitterKeywordAndLocation['location'].astype('string')

In [None]:
#Visualization

#Counter of keywords
twitterKeywordAndLocation['counter'] = 1
twitterKeywordAndLocation

In [None]:
#Number of keywords per location
keywordsPerLocation = twitterKeywordAndLocation.groupby('location')\
.agg({'counter' : 'sum'})
keywordsPerLocation = keywordsPerLocation.reset_index()
keywordsPerLocation

In [None]:
#Cleaning the data
#Dropping those locations that don't have a significant number of keywords
#For that, first we see the average
keywordMean = keywordsPerLocation['counter'].mean()
keywordMean = int(keywordMean)
keywordMean

In [None]:
#Then, we filter
keywordsPerLocation = keywordsPerLocation[keywordsPerLocation['counter'] > keywordMean]
keywordsPerLocation.describe()

In [None]:
top20KeywordsPerLocation = keywordsPerLocation.nlargest(20, 'counter')
top20KeywordsPerLocation

In [None]:
#barplot
#One bar per location for the 20 locations with the highest keyword count;
#the figure is also written to disk as a PNG

ax = sns.barplot(x = 'location', y = 'counter', data = top20KeywordsPerLocation,\
                palette = sns.cubehelix_palette(20, reverse = True))
ax.set_ylabel('Keyword counter', size = 16)
ax.set_xlabel('Location', size = 16)
ax.set_title('Top 20 locations with most number of keywords', size = 20)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 60) #rotate the location names on the x axis
ax.figure.set_size_inches(16, 6)
plt.tight_layout()
ax.get_figure().savefig("Top20LocationsWithMosthKeywords.png")

In [None]:
#Most popular keywords
keywordsPopular = twitterKeywordAndLocation.groupby('keyword')\
.agg({'counter' : 'sum'})
keywordsPopular = keywordsPopular.reset_index()

In [None]:
#Removing non representative samples
keywordsPopularMean = keywordsPopular.mean()
keywordsPopularMean = int(keywordsPopularMean)
keywordsPopular = keywordsPopular[keywordsPopular['counter'] > keywordsPopularMean]
keywordsPopular.describe()

In [None]:
top20KeywordsPopular = keywordsPopular.nlargest(20, 'counter')

In [None]:
#barplot
#One bar per keyword for the 20 keywords that appear in most tweets

ax = sns.barplot(x = 'keyword', y = 'counter', data = top20KeywordsPopular,\
                palette = sns.cubehelix_palette(30, start=.5, rot = -.75, reverse = True))
ax.set_ylabel('Occurrence of keywords in different tweets', size = 12)
ax.set_xlabel('Keyword', size = 16)
ax.set_title('Top 20 most popular keywords', size = 20)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 60) #rotate the keywords on the x axis
ax.figure.set_size_inches(16, 6)
plt.tight_layout()
#'optimize' is no longer a savefig keyword (matplotlib deprecated it in 3.3 and
#removed it afterwards); options for the Pillow writer are passed through pil_kwargs
ax.get_figure().savefig("Top20MostPopularKeywords.png", pil_kwargs = {'optimize': True})

In [None]:
#Comparison between the most popular keywords and locations associated to those keywords
locationAndKeyword = pd.merge(twitterKeywordAndLocation, keywordsPopular, on = 'keyword')
locationAndKeyword.head()

In [None]:
locationAndKeyword.drop(['counter_x', 'counter_y'], axis = 1, inplace = True)
locationAndKeyword['counter'] = 1
locationAndKeyword.head()

In [None]:
locationAndKeyword = locationAndKeyword.groupby(['keyword', 'location'])\
.agg({'counter' : 'sum'})
locationAndKeyword = locationAndKeyword.sort_values(by = 'counter', ascending = False)
locationAndKeyword = locationAndKeyword.reset_index()
locationAndKeyword

In [None]:
#scatterplot
#The first 20 rows of locationAndKeyword (sorted by counter, descending) are plotted
#as keyword vs. location points whose colour encodes the counter

g = sns.relplot(x = 'keyword', y = 'location', hue = 'counter',\
            s = 150, alpha = .5, height = 5, data = locationAndKeyword.head(20),\
               palette = "winter_r")

g.ax.set_title('Locations per popular keyword', fontsize = 20)
g.set_xlabels('Keyword',fontsize = 18)
g.set_ylabels('Location', fontsize = 18)
g.ax.set_xticklabels(g.ax.get_xticklabels(), rotation = 80) #rotate the keywords on the x axis
g.ax.figure.set_size_inches(10, 6)
plt.tight_layout()
g.ax.get_figure().savefig("LocationPeroPopularKeyword.png")

In [None]:
#Starting the analysis of the relation between keywords and veracity
#Getting the data
twitterCleanData = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv')
#Rows whose keyword is missing ('unknown') are filtered out and the keyword
#column is cast to the 'string' dtype
hasKeyword = twitterCleanData['keyword'] != 'unknown'
twitterKeywordAndTarget = twitterCleanData.loc[hasKeyword, ['keyword', 'target']].astype({'keyword' : 'string'})
twitterKeywordAndTarget.head()

In [None]:
#Veracity per keyword
veracityPerKeyword = twitterKeywordAndTarget.groupby('keyword')\
.agg({'target' : ['sum', 'count']})
veracityPerKeyword.columns = ['target_count','target_sum']
veracityPerKeyword.head()

In [None]:
#Removing non representative samples
veracityPerKeywordMean = veracityPerKeyword[('target_count')].mean()
veracityPerKeywordMean = int(veracityPerKeywordMean)
veracityPerKeyword = veracityPerKeyword[veracityPerKeyword[('target_count')] > veracityPerKeywordMean]
veracityPerKeyword.describe()

In [None]:
veracityPerKeyword['veracity'] = (veracityPerKeyword['target_count'] / veracityPerKeyword['target_sum']) * 100
veracityPerKeyword.drop(columns = ['target_count', 'target_sum'], inplace=True)
veracityPerKeyword.head()

In [None]:
veracityPerKeyword = veracityPerKeyword.reset_index()

In [None]:
top10KeywordsInRealTweets = veracityPerKeyword.nlargest(10, 'veracity')

In [None]:
#barplot
#The 10 keywords with the highest veracity percentage

ax = sns.barplot(x = 'keyword', y = 'veracity', data = top10KeywordsInRealTweets,\
                palette = sns.cubehelix_palette(10,  rot = -.75, reverse = True))
ax.set_ylabel('Veracity percentage', size = 12)
ax.set_xlabel('Keyword', size = 16)
ax.set_title('Top 10 keywords within tweets with the highest veracity level', size = 20)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 60) #rotate the keywords on the x axis
ax.figure.set_size_inches(16, 6)
plt.tight_layout()
#'optimize' is no longer a savefig keyword (matplotlib deprecated it in 3.3 and
#removed it afterwards); options for the Pillow writer are passed through pil_kwargs
ax.get_figure().savefig("Top10keywordstweetshighestveracity.png", pil_kwargs = {'optimize': True})

In [None]:
top10KeywordsInFalseTweets = veracityPerKeyword.nsmallest(10, 'veracity')

In [None]:
#barplot

ax = sns.barplot(x = 'keyword', y = 'veracity', data = top10KeywordsInFalseTweets,\
                palette = sns.cubehelix_palette(10, start=.1, rot = .55, reverse = True))
ax.set_ylabel('Veracity percentage', size = 12)
ax.set_xlabel('Keyword', size = 16)
ax.set_title('Top 10 keywords within tweets with the lowest veracity level', size = 20)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 60)
ax.figure.set_size_inches(16, 6)
plt.tight_layout()
ax.get_figure().savefig("Top10keywordstweetslowestveracity.png", optimize = True)

In [None]:
tweetsInfo = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv', usecols = ['text', 'target'])
tweetsInfo.head()

In [None]:
#Starting the analisis for the relation between keywords and hashtags
hashForKeywordsAndHashtags = {}
csvFormatted = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv', usecols = ['keyword', 'text', 'target'])
csvFormatted = csvFormatted[csvFormatted['keyword'] != 'unknown']
csvFormatted['keyword'].value_counts().head(20)

In [None]:
def sumHashtagIfNedeed(line, keyword, hashOfKeywords):
    """Count the hashtags of one tweet under the tweet's keyword.

    line: text of the tweet; it is split on whitespace.
    keyword: key of hashOfKeywords under which the hashtags are counted.
    hashOfKeywords: dict updated in place, keyword -> {hashtag: occurrences}.

    Only words that start with '#' are counted. They are lower-cased and
    their leading '#' characters are removed before counting. The entry for
    keyword is created only once a first hashtag is found. Returns None.
    """
    for token in line.split():
        if token.startswith('#'):
            hashtag = token.lower().lstrip('#')
            perKeyword = hashOfKeywords.setdefault(keyword, {})
            perKeyword[hashtag] = perKeyword.get(hashtag, 0) + 1

In [None]:
csvFormatted.apply(lambda x: sumHashtagIfNedeed(x['text'], x['keyword'], hashForKeywordsAndHashtags), axis = 1)

In [None]:
d = {'keyword': [], 'amount': []}
for key in hashForKeywordsAndHashtags:
    d['keyword'].append(key)
    d['amount'].append(sum(hashForKeywordsAndHashtags[key].values()))
keywordDf = pd.DataFrame(d, columns =['keyword', 'amount'])
keywordDf = keywordDf.sort_values(by = ['amount']).tail(20)
keywordDf

In [None]:
#Mean of target and number of tweets for every keyword, restricted (by the inner
#merge) to the keywords listed in keywordDf
#The column to drop is named with the 'columns' keyword: passing the axis
#positionally, as in drop('text', 1), is not accepted by pandas 2.0 and later
csvWithOnlyKeywordTarget = csvFormatted.drop(columns = 'text')
csvWithOnlyKeywordTarget = csvWithOnlyKeywordTarget.groupby(['keyword']).agg({'target': ['mean', 'count']})
#Flatten the two-level column labels into 'target_mean' and 'target_count'
csvWithOnlyKeywordTarget.columns = csvWithOnlyKeywordTarget.columns.get_level_values(0) + '_' + csvWithOnlyKeywordTarget.columns.get_level_values(1)
csvWithOnlyKeywordTarget = csvWithOnlyKeywordTarget.sort_values(by = ['target_mean']).reset_index() #Up to here ALL the truth values are available
csvWithOnlyKeywordTarget = pd.merge(csvWithOnlyKeywordTarget, keywordDf, on='keyword', how='inner')
csvWithOnlyKeywordTarget.head(20)

In [None]:
#Length of every tweet, measured in characters
tweetsInfo['tweet_length'] = tweetsInfo['text'].str.len()

In [None]:
def validUser(userName):
    """Tell whether a word contains a well-formed @mention.

    The mention is taken with getter(userName, '@'). It is valid when,
    counting the '@', it has between 2 and 16 characters and every character
    after the '@' is alphanumeric or an underscore.
    Returns True or False.
    """
    if '@' not in userName:
        return False
    mention = getter(userName, '@')
    if not (1 < len(mention) <= 16):
        return False
    return all(symbol.isalnum() or symbol == '_' for symbol in mention[1:])

In [None]:
def validLink(link):
    """Tell whether a word looks like a link.

    A word is accepted when it contains 'https://' and is longer than
    9 characters, or contains 'http://' and is longer than 8 characters.
    Returns True or False.
    """
    minimumLengths = (('https://', 9), ('http://', 8))
    return any(scheme in link and len(link) > minimum for scheme, minimum in minimumLengths)

In [None]:
def validHashtag(hashtag):
    """Tell whether a word contains a well-formed #hashtag.

    The hashtag is taken with getter(hashtag, '#'); it is valid when the
    text that follows its first character is non-empty and entirely
    alphanumeric. Returns True or False.
    """
    if '#' not in hashtag:
        return False
    return getter(hashtag, '#')[1:].isalnum()

In [None]:
#Function to analyze the elements (#, @, links) of the tweet
def analyzeTweets(text):
    """Count the tagged users, hashtags and links of a tweet.

    The text is split on whitespace and every word is counted at most once:
    as a user if validUser accepts it, otherwise as a hashtag if validHashtag
    accepts it, otherwise as a link if validLink accepts it.
    Returns the list [users tagged, hashtags, links].
    """
    usersTagged = hashtags = links = 0
    for token in text.split():
        if validUser(token):
            usersTagged += 1
        elif validHashtag(token):
            hashtags += 1
        elif validLink(token):
            links += 1
    return [usersTagged, hashtags, links]

In [None]:
def dataFrameMaker(text, dicc, char, func1, func2):
    """Accumulate in dicc the @users or #hashtags found in one tweet.

    text: the tweet text followed by its target; the target is always the
        last whitespace-separated token and is read as an int.
    dicc: dict updated in place, key: lower-cased @user or #hashtag,
        value: [number of occurrences, sum of the targets].
    char: '#' or '@'; only the words that contain it are examined.
    func1: called as func1(word, char), extracts the hashtag or user.
    func2: called on the result of func1, tells whether it is correct.

    Returns None.
    """
    tokens = text.split()
    targetValue = int(tokens[-1])
    for token in tokens:
        if char not in token:
            continue
        candidate = func1(token, char)  #candidate could be a @user or a #hashtag
        if not func2(candidate):
            continue
        tally = dicc.setdefault(candidate.lower(), [0, 0])
        tally[0] += 1
        tally[1] += targetValue

In [None]:
def colsCombination(col1, condition, col2, col3):
        """Join two columns of tweetsInfo for the rows that pass a filter.

        col1: column of tweetsInfo (a notebook-level frame) used to filter.
        condition: rows are kept when their col1 value is greater than this.
        col2: column whose values must be of type str.
        col3: column whose values are converted to str.

        Returns a Series with "<col2> <col3>" for every kept row.
        """
        selectedRows = tweetsInfo[tweetsInfo[col1] > condition]
        return selectedRows[col2] + ' ' + selectedRows[col3].astype(str)

In [None]:
def getter(text, char):
    """Cut the hashtag or user out of a word.

    Returns text from the first occurrence of char onwards, without one
    trailing ':' or '.' (some users or hashtags finish with them).
    When char is absent, find() gives -1, so the slice starts at the last
    character of text.
    """
    start = text.find(char)
    candidate = text[start:]
    if candidate.endswith((':', '.')):
        return candidate[:-1]
    return candidate

In [None]:
#NOTE(review): this cell repeats, statement for statement, the colsCombination
#defined a few cells above; running it only rebinds the same name.
#col1: column of the tweetsInfo frame used to filter
#condition: rows are kept when their col1 value is greater than this value
#col2: must be of type str
#col3: its type will be transformed into str
#returns a Series with the combination "<col2> <col3>" for the kept rows
def colsCombination(col1, condition, col2, col3):
        filterCondition = tweetsInfo[col1] > condition  #boolean mask over the notebook-level tweetsInfo
        strCol2 = tweetsInfo[filterCondition][col2]
        strCol3 = tweetsInfo[filterCondition][col3]
        strCol3 = strCol3.astype(str)
        result = strCol2 + ' ' + strCol3
        return result

In [None]:
def getInfo(dataList, pos):
    """Return the element of dataList found at pos (plain dataList[pos] indexing)."""
    element = dataList[pos]
    return element

In [None]:
#analyzeTweets returns [users tagged, hashtags, links] for every tweet;
#each position of that list becomes a column of its own
elementCounts = tweetsInfo['text'].apply(analyzeTweets)

for position, columnName in enumerate(['users_tagged', 'hashtags', 'links']):
    tweetsInfo[columnName] = elementCounts.apply(getInfo, args=(position,))

tweetsInfo.head()

In [None]:
#One frame per kind of element with the tweet length, the number of elements of that
#kind in the tweet ('Amount') and the name of the kind. rename/assign return new
#frames, so nothing is written into a slice of tweetsInfo
hashtagFrame = tweetsInfo[['tweet_length','hashtags']].rename(columns={'hashtags':'Amount'}).assign(tweet_element='hashtag')
linksFrame = tweetsInfo[['tweet_length','links']].rename(columns={'links':'Amount'}).assign(tweet_element='link')
usersFrame = tweetsInfo[['tweet_length','users_tagged']].rename(columns={'users_tagged':'Amount'}).assign(tweet_element='user_tagged')

#pd.concat stacks the three frames; DataFrame.append was deprecated by pandas and
#removed in version 2.0
appendedElements = pd.concat([hashtagFrame, linksFrame, usersFrame])

#Average tweet length and number of tweets for every (kind of element, amount) pair
groupedElements = appendedElements.groupby(['tweet_element','Amount']).agg({'tweet_length':['mean','count']})
labels0 = groupedElements.columns.get_level_values(0)
labels1 = groupedElements.columns.get_level_values(1)
groupedElements.columns = labels0 + '_' + labels1
groupedElements = groupedElements.reset_index()
groupedElements = groupedElements.rename(columns={'tweet_length_count':'occurrence', 'tweet_element':'Tweet element',\
                               'tweet_length_mean':'Average tweet length'})

plot = sns.lmplot(x="Average tweet length", y="Amount", col="Tweet element", hue="Tweet element", data=groupedElements, col_wrap=2, ci=None, palette="muted", height=4,\
         scatter_kws={"s": 50, "alpha": 1},legend = True)

# add annotations one by one with a loop
#NOTE(review): like the original counter-based lookup, this assumes the facets come in
#the order hashtag, link, user_tagged - confirm against the rendered figure
for ax, element in zip(plot.axes, ['hashtag', 'link', 'user_tagged']):
    ax.set_yticks([0,2,4,6,8,10,12,14]) #set once per facet instead of once per row
    for line in range(0, groupedElements.shape[0]):
        if groupedElements['Tweet element'][line] == element:
            #next to every point, write the number of tweets it stands for
            ax.text(groupedElements['Average tweet length'][line]+0.30, groupedElements.Amount[line], groupedElements.occurrence[line],\
            horizontalalignment='left', size='small', color='black', weight='semibold')

In [None]:
#Links boxplot
#Distribution of the tweet length for tweets with 0 to 3 links
colors = {0: 'mistyrose', 1: 'salmon', 2: 'indianred', 3: 'firebrick'} #one colour per number of links
linksData = tweetsInfo[['tweet_length','links']][tweetsInfo.links <= 3]
ax = sns.boxplot(x = 'links', y = 'tweet_length', data = linksData, palette = colors)
ax.set_title('Use of links according to length of tweets',fontsize = 16)
ax.set_ylabel('Tweet length (amount of characters)', fontsize = 14)
ax.set_xlabel('Number of links per tweet', fontsize = 14);

In [None]:
#Users tagged boxplot
#Distribution of the tweet length for tweets with fewer than 5 tagged users
usersTaggedData = tweetsInfo[['tweet_length','users_tagged']][tweetsInfo.users_tagged < 5]
ax = sns.boxplot(x = 'users_tagged', y = 'tweet_length', data = usersTaggedData)
ax.set_title('Use of tags according to length of tweets',fontsize = 16)
ax.set_ylabel('Tweet length (amount of characters)', fontsize = 14)
ax.set_xlabel('Number of tags per tweet', fontsize = 14);

In [None]:
#Building the usersTagged df
usersDicc = {}
tweetsInfoTags = colsCombination('users_tagged',0,'text','target')
tweetsInfoTags.apply(dataFrameMaker, args = (usersDicc,'@',getter,validUser))

In [None]:
usersSerie = pd.Series(usersDicc)
usersDataFrame = usersSerie.to_frame(name='auxCol')
usersDataFrame['occurrence'] = usersDataFrame.auxCol.apply(getInfo,args=(0,))
usersDataFrame['target_sum'] = usersDataFrame.auxCol.apply(getInfo,args=(1,))
del usersDataFrame['auxCol']
usersDataFrame.head()

In [None]:
#Top 10 mentioned users barplot
#Only users mentioned more than 5 times are considered
topMentions = usersDataFrame[usersDataFrame.occurrence > 5]
topMentionUsers = topMentions.occurrence.nlargest(10).index

#Two bar series are drawn on the same axes: all the mentions first and, over them,
#the sum of targets of the tweets with those mentions
ax = sns.barplot(x=topMentionUsers, y = topMentions.loc[topMentionUsers,'occurrence'],color='sandybrown',label='All mentions')
sns.barplot(x=topMentionUsers, y = topMentions.loc[topMentionUsers,'target_sum'], color='darkorange',label='True tweets')

ax.set_title('Top 10: Mentioned users', fontsize=20)
ax.set_xlabel('Users', fontsize = 18)
ax.set_ylabel('Total mentions', fontsize = 18)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.tick_params(axis="both", labelsize=16)
ax.legend(ncol=2, loc='best', frameon=True);
ax.figure.set_size_inches(12, 6);

In [None]:
#Building the hashtags df
hashtagsDicc = {}
tweetsInfoHashtags = colsCombination('hashtags',0,'text','target')
tweetsInfoHashtags.apply(dataFrameMaker, args = (hashtagsDicc,'#',getter,validHashtag))

In [None]:
hashtagsSerie = pd.Series(hashtagsDicc)
hashtagsDataFrame = hashtagsSerie.to_frame(name='auxCol')
hashtagsDataFrame['occurrence'] = hashtagsDataFrame.auxCol.apply(getInfo, args=(0,))
hashtagsDataFrame['target_sum'] = hashtagsDataFrame.auxCol.apply(getInfo, args=(1,))
del hashtagsDataFrame['auxCol']
hashtagsDataFrame.head()

In [None]:
#Trending topics barplot
#Only hashtags that occur more than 5 times are considered
trendingTopics = hashtagsDataFrame[hashtagsDataFrame.occurrence > 5]
trendingTopicHashtags = trendingTopics.occurrence.nlargest(10).index

#Two bar series are drawn on the same axes: every occurrence first and, over them,
#the sum of targets of the tweets with those hashtags
ax = sns.barplot(x=trendingTopicHashtags, y = trendingTopics.loc[trendingTopicHashtags,'occurrence'],color='sandybrown',label='Hashtag occurrence')
sns.barplot(x=trendingTopicHashtags, y = trendingTopics.loc[trendingTopicHashtags,'target_sum'], color='darkorange',label='True tweets')

ax.set_title('Trending topics', fontsize=20)
ax.set_xlabel('Hashtags', fontsize = 18)
ax.set_ylabel('Total mentions', fontsize = 18)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.tick_params(axis="both", labelsize=16)
ax.legend(ncol=2, loc='best', frameon=True)
ax.figure.set_size_inches(12, 6);

In [None]:
grouped = tweetsInfo.groupby('tweet_length').agg({'target':'sum','text':'count','hashtags':'sum','users_tagged':'sum','links':'sum'})
grouped['total_elements'] = grouped.links + grouped.hashtags + grouped.users_tagged
grouped['truth_percentage'] = (grouped.target / grouped.text) * 100
grouped.index.rename('lengths', inplace = True)

In [None]:
max_quantity, min_quantity = grouped.text.max(), grouped.text.min()
max_quantity, min_quantity

In [None]:
grouped.drop(grouped[grouped.text <= 10].index, inplace=True)
grouped.reset_index(inplace = True)
grouped.head()

In [None]:
#Regplot 1
#Scatter of truth percentage against tweet length, with a fitted regression line
ax = sns.regplot(x='lengths', y='truth_percentage', data=grouped,\
                line_kws = {'color':'lightsalmon','alpha':0.5,'lw':3},\
                color = 'brown')

ax.set_xlabel('Tweet lengths(amount of characters)', fontsize = 14)
ax.set_ylabel('Percentage of veracity (%)', fontsize = 14)
ax.set_yticks(np.arange(0,110,10)) #ticks every 10 units, from 0 to 100
ax.set_title('Tweet length vs veracity', fontsize=16)
ax.figure.set_size_inches(14,4);

In [None]:
def tweetSize(tweetLength, minValue, intervalRange):
    """Classify a tweet length as 'small', 'medium' or 'large'.

    tweetLength: length to classify.
    minValue: lower bound of the first interval.
    intervalRange: width of every interval.

    Lengths below minValue + intervalRange are 'small', lengths below
    minValue + 2 * intervalRange are 'medium' and the rest are 'large'.
    """
    smallLimit = minValue + intervalRange
    mediumLimit = minValue + 2 * intervalRange
    if tweetLength < smallLimit:
        return 'small'
    elif tweetLength < mediumLimit:
        return 'medium'
    else:
        return 'large'

In [None]:
#Parallel coordinates to show the characteristics of the tweets
#First we get the range of each interval
minValue = tweetsInfo.tweet_length.min()
maxValue = tweetsInfo.tweet_length.max()
intervalRange = (maxValue - minValue) // 3

#We add a new column
tweetsInfo['tweet_size'] = tweetsInfo.tweet_length.apply(tweetSize, args = (minValue, intervalRange))
tweetsInfo.head(5)

In [None]:
groupedSize = tweetsInfo.groupby(['tweet_size', 'tweet_length']).agg({'target':'sum','users_tagged':'sum','links':'sum','hashtags':'sum', 'text':'count'})
groupedSize.reset_index(inplace=True)
groupedSize['truth_percentage'] = (groupedSize.target / groupedSize.text) * 100
groupedSize = groupedSize[groupedSize.text >= 10]

In [None]:
#For the plot we need a numerical value to plot the lines in different colors
def tweetSizeID(tweet):
    """Map a size label to a number: 'small' -> 1, 'medium' -> 2, anything else -> 3."""
    sizeIds = {'small': 1, 'medium': 2}
    return sizeIds.get(tweet, 3)

#Numeric id of the size label, used below to colour the lines
groupedSize['tweet_size_id'] = groupedSize.tweet_size.apply(tweetSizeID)

In [None]:
#Parallel coordinates
#One line per row of groupedSize across the listed dimensions, coloured by tweet_size_id
fig = px.parallel_coordinates(groupedSize, color= 'tweet_size_id',\
                              dimensions=['tweet_size_id','tweet_length','links' ,'hashtags',\
                                          'users_tagged','truth_percentage'],\
                             labels = {'tweet_length':'Tweet length','links':'Links sum','hashtags':'Hashtags sum',\
                                      'users_tagged':'Users tagged sum','truth_percentage':'Truth perentage',\
                                      'tweet_size_id':'Tweet size ID'})

fig.update_layout(coloraxis_showscale=False) #hide the colour scale
fig.update_layout(title={'text': 'Characteristics of the tweets according to their length','y':1.,'x':0.5})
fig.show()

In [None]:
#Natural disasters
#id, keyword and location are read again from the CSV and placed side by side
#(axis = 1, aligned on the index) with the columns of tweetsInfo
df1 = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv', usecols=['id','keyword','location'])
df2 = tweetsInfo  #same object as tweetsInfo, not a copy
disastersDF = pd.concat([df1,df2], axis = 1)
disastersDF.head()

In [None]:
#Some auxiliary functions
def getSeriesElements(serie, setElements):
    """Add the lower-cased elements of a series to a set.

    serie: object whose .values are always strings.
    setElements: set updated in place.

    A value that contains '/' holds several elements separated by '/';
    every one of them is added on its own. Returns None.
    """
    for rawValue in serie.values:
        setElements.update(piece.lower() for piece in rawValue.split('/'))

In [None]:
#the info is between position 2 and 6, both included
def obtainInfo(infoList):
    """Build the dictionary of natural disasters from a list of tables.

    infoList: list of DataFrames; only positions 2 to 6 (both included) are
        read. The tables are not modified, so the function can be called
        more than once on the same list.

    For every table the first row (label 0) is ignored because it has no
    info, the group is read from row 0, column 1 of what remains, and the
    subgroups are collected from the columns labelled 2 onwards with
    getSeriesElements.

    Returns a dict, key: group, value: set of subgroups.
    """
    naturalDisasters = {} #Key: group, value: {subgroups}
    for i in range (2,7): #To iterate the info in the list
        #drop returns a new frame: the caller's table keeps its first row
        dataFrame = infoList[i].drop(0)
        #The group always is at (0,1)
        group = dataFrame.iloc[0,1]
        #Now it is time to iterate the columns of the DF
        cols = len(dataFrame.columns)
        subgroups = set()
        for col in range(2, cols):
            #Missing and repeated values are removed on a new Series, not in place
            serie = dataFrame[col].dropna().drop_duplicates()
            getSeriesElements(serie, subgroups)
        naturalDisasters[group] = subgroups
    return naturalDisasters

In [None]:
#Reading the info about natural disasters
#naturalDisastersDicc key: group value: set of subgroups
#NOTE(review): read_html downloads the page every time the cell runs, so the
#result depends on the current content of that page
dataPage = pd.read_html('https://www.emdat.be/classification')
naturalDisastersDicc = obtainInfo(dataPage)

In [None]:
#Adding missing items
#Every set below is the one stored in naturalDisastersDicc, so the edits are made in place.
#The group names and the removed labels must match the scraped text exactly:
#a missing group raises KeyError, and so does remove() on a missing label
geo = naturalDisastersDicc['Geophysical']
geo.update({'volcano', 'sinkhole', 'lava'})

met = naturalDisastersDicc['Meteorological']
met.update({'hurricane','typhoon','twister','cyclone','hailstorm',\
            'violent storm','rainstorm','sandstorm','snowstorm','windstorm'})
met -= {'lightning','derecho','sand','wind'}

hydro = naturalDisastersDicc['Hydrological']
hydro.update({'debris','mudslide','avalanche','rockfall'})
hydro.remove('avalanche (snow, debris, mudflow, rockfall)') #replaced by the single items added above

clima = naturalDisastersDicc['Climatological']
clima.update({'bush fire', 'land fire', 'brush fire'})
clima.remove('land fire: brush, bush,  pasture') #replaced by the single items added above

In [None]:
#A single set with the subgroups of every group
allNaturalDisasters = set().union(*naturalDisastersDicc.values())

In [None]:
#Some keywords are about natural disasters but they are written in plural
def fixingKeywords(keyword):
    """Return the singular form of the known plural keywords.

    'floods', 'wild fires', 'forest fires' and 'bush fires' are replaced by
    'flood', 'wildfire', 'forest fire' and 'bush fire'; any other keyword
    is returned unchanged.
    """
    singularForms = dict([
        ('floods', 'flood'),
        ('wild fires', 'wildfire'),
        ('forest fires', 'forest fire'),
        ('bush fires', 'bush fire'),
    ])
    if keyword in singularForms:
        return singularForms[keyword]
    return keyword

In [None]:
disastersDF.keyword = disastersDF.keyword.apply(fixingKeywords)

In [None]:
condition = ~(disastersDF.keyword.isin(allNaturalDisasters))
naturalDisastersDF = disastersDF.drop(disastersDF[condition].index)
naturalDisastersDF.head()

In [None]:
#group by subgroup of natural disaster
natDisastGrouped = naturalDisastersDF.groupby('keyword').agg({'tweet_length':['max','min','mean'],\
                                                             'text':'count','target':'sum',\
                                                             'users_tagged':'sum','hashtags':'sum','links':'sum'})
natDisastGrouped.head()

In [None]:
#Changing the labels
labels0 = natDisastGrouped.columns.get_level_values(0)
labels1 = natDisastGrouped.columns.get_level_values(1)
natDisastGrouped.columns = labels0 + '_' + labels1
natDisastGrouped.head()

In [None]:
#Function to obtain the group of a keyword
def naturalDisasterGroup(keyword):
    """Return the first group of naturalDisastersDicc whose set of subgroups
    contains keyword, or None when no group contains it."""
    matchingGroups = (group for group, subgroups in naturalDisastersDicc.items() if keyword in subgroups)
    return next(matchingGroups, None)

In [None]:
natDisastGrouped.reset_index(inplace=True)
#Adding the column 'group', to the data frama
natDisastGrouped['group'] = natDisastGrouped.keyword.apply(naturalDisasterGroup)

In [None]:
natDisastGrouped.shape

In [None]:
natDisastGrouped.rename(columns = {'keyword':'subgroup'},inplace=True)
natDisastGrouped.sort_values(by='group',inplace=True)  #easy to order, has 30 rows

In [None]:
natDisastGrouped.set_index(['group','subgroup'],inplace=True)
natDisastGrouped.head()

In [None]:
#Adding the column 'truth_percentage' = (target_sum / text_count) * 100
natDisastGrouped['truth_percentage'] = (natDisastGrouped.target_sum / natDisastGrouped.text_count) * 100
natDisastGrouped

In [None]:
#Veracity of the subgroups
#Horizontal bars, one per subgroup, ordered from the highest truth percentage to the lowest
subVeracity = natDisastGrouped.reset_index().sort_values(by='truth_percentage',ascending=False)
ax = sns.barplot(x = 'truth_percentage', y = subVeracity.subgroup, data = subVeracity);
ax.set_title('Natural disasters subgroups: veracity', fontsize=20)
ax.set_xlabel('Percentage of veracity(%)', fontsize = 18)
ax.set_ylabel('Natural Disasters subgroups', fontsize = 18)
#NOTE(review): the first tick_params call is overridden by the second one, which sets the same property
ax.tick_params(axis="x", labelsize='large')
ax.tick_params(axis="x", labelsize=16)
ax.tick_params(axis="y", labelsize=16)
ax.set_xticks(np.arange(0,110,10)) #ticks every 10 units, from 0 to 100
ax.figure.set_size_inches(10, 8);

In [None]:
#Parallel coordinates
#NOTE(review): this import sits in the middle of the notebook instead of the import cell at the top
from pandas.plotting import parallel_coordinates
#The 5 subgroups with the highest truth percentage, with readable column names for the axes
df = natDisastGrouped.reset_index().sort_values(by='truth_percentage',ascending=False)[:5].rename(columns={'truth_percentage':'Truth percentage', 'text_count':'Text count',\
                                                                                                           'target_sum':'Target sum','links_sum':'Links sum',\
                                                                                                           'users_tagged_sum':'Users tagged sum','hashtags_sum':'Hashtags sum'})
lineColors = ('firebrick','cadetblue','orange','forestgreen','magenta') #one colour per subgroup

ax = parallel_coordinates(df, 'subgroup', cols = ['Truth percentage', 'Text count','Target sum','Links sum', 'Users tagged sum','Hashtags sum'],\
                          color = lineColors, lw = 5.0)
ax.set_title('Top 5 subgroups: characteristics', fontsize= 16)
ax.figure.set_size_inches(16, 8)
ax.legend(loc='best');

In [None]:
#Analysis of the kind of hashtags used in tweets based on the tweet's veracity

In [None]:
#train keeps a reference to the frame with all the columns added so far
train = tweetsInfo

In [None]:
#From here on tweetsInfo holds only the text and target columns of twitterCleanData
tweetsInfo = twitterCleanData[['text', 'target']]
tweetsInfo.head()

In [None]:
def getValidHashtags(text, char):
    """Return a list with all the valid hashtags of a text.

    text: text to scan; it is split on whitespace.
    char: character handed to getter to cut the hashtag out of every word.

    A hashtag is valid when validHashtag accepts it, that is, when it only
    contains alphanumeric values. The hashtags keep the order and the
    letter case they have in the text; repeated hashtags are repeated in
    the list.
    """
    candidates = (getter(word, char) for word in text.split())
    return [candidate for candidate in candidates if validHashtag(candidate)]

In [None]:
hashtagsDataFrame = hashtagsDataFrame.reset_index()
hashtagsDataFrame = hashtagsDataFrame.rename(columns = {'index' : 'hashtag'})
hashtagsDataFrame.head()

In [None]:
textPerVeracity = tweetsInfo.groupby('target').agg({'text' : 'sum'})

In [None]:
#Returns a DF with hashtags included in tweets of veracity 'target', their occurrence and target_sum
#target = 0 -> false tweets
#target = 1 -> real tweets
def hashtagPerVeracityDFMaker(target, char):
    """Build the frame of hashtags used in the tweets of one veracity class.

    target: row of textPerVeracity to read (0: false tweets, 1: real tweets).
    char: character handed to getValidHashtags to recognise the hashtags.

    Returns the rows of hashtagsDataFrame (hashtag, occurrence, target_sum)
    whose hashtag appears in the text of that class, without repeated rows.
    """
    df = pd.DataFrame()
    #hashtagsDataFrame holds its hashtags in lower case (dataFrameMaker lower-cases
    #every key), so the hashtags read from the text are lower-cased as well;
    #otherwise every hashtag written with capital letters is lost in the merge
    df['hashtag'] = [hashtag.lower() for hashtag in getValidHashtags(textPerVeracity.loc[target,'text'], char)]
    df = hashtagsDataFrame.merge(df, on = 'hashtag')
    df = df.drop_duplicates()
    return df

In [None]:
#Creating DF with hashtags and the veracity of the tweets containing them
#Hashtags in false tweets:
DFHashtagPerFalseTweets = hashtagPerVeracityDFMaker(0, '#')
DFHashtagPerFalseTweets['occurrence'] = DFHashtagPerFalseTweets['occurrence'] - DFHashtagPerFalseTweets['target_sum']
del DFHashtagPerFalseTweets['target_sum']
top10HashtagPerFalseTweets = DFHashtagPerFalseTweets.nlargest(10, columns = 'occurrence')
top10HashtagPerFalseTweets.head(3)

In [None]:
#Some statistics
DFHashtagPerFalseTweets.describe()

In [None]:
#Bar plot
#The 10 hashtags with the highest occurrence in false tweets; the figure is saved as a PNG
ax = sns.barplot(x = 'hashtag', y = 'occurrence', data = top10HashtagPerFalseTweets,\
                 palette = sns.color_palette("Reds_r", 10))
ax.set_xlabel('Hashtags', fontsize = 15)
ax.set_ylabel('Occurrence', fontsize = 15)
ax.set_title('Top 10 hashtags in false tweets', fontsize = 20)
plt.xticks(rotation = 65, horizontalalignment = 'right') #rotate the hashtags on the x axis
ax.figure.set_size_inches(15, 6)
plt.tight_layout()
ax.get_figure().savefig("Top10HashtagsInFalseTweets.png")

In [None]:
#Hashtags in real tweets:
DFHashtagPerRealTweets = hashtagPerVeracityDFMaker(1, '#')
DFHashtagPerRealTweets['occurrence'] = DFHashtagPerRealTweets['target_sum']
del DFHashtagPerRealTweets['target_sum']
top10HashtagPerRealTweets = DFHashtagPerRealTweets.nlargest(10, 'occurrence')
top10HashtagPerRealTweets.head(3)

In [None]:
#Some statistics
DFHashtagPerRealTweets.describe()

In [None]:
#Bar plot
#The 10 hashtags with the highest occurrence in real tweets; the figure is saved as a PNG
ax = sns.barplot(x = 'hashtag', y = 'occurrence', data = top10HashtagPerRealTweets,\
                 palette = sns.color_palette("Greens_r", 10))
ax.set_xlabel('Hashtags', fontsize = 15)
ax.set_ylabel('Occurrence', fontsize = 15)
ax.set_title('Top 10 hashtags in real tweets', fontsize = 20)
plt.xticks(rotation = 65, horizontalalignment = 'right') #rotate the hashtags on the x axis
ax.figure.set_size_inches(15, 6)
plt.tight_layout()
ax.get_figure().savefig("Top10HashtagsInRealTweets.png")

In [None]:
#Comparison between hashtags that appear both in real and false tweets
hashtagsPerVeracity = DFHashtagPerFalseTweets.merge(DFHashtagPerRealTweets, on = 'hashtag')
hashtagsPerVeracity.head(3)

In [None]:
hashtagsPerVeracity['total occurrence'] = hashtagsPerVeracity['occurrence_x'] + hashtagsPerVeracity['occurrence_y']
hashtagsPerVeracity = hashtagsPerVeracity.rename(columns = {'occurrence_y' : 'occurrence real tweets'})
del hashtagsPerVeracity['occurrence_x']
top10HashtagsPerVeracity = hashtagsPerVeracity.nlargest(20, 'total occurrence')

In [None]:
hashtagsPerVeracity.describe()

In [None]:
# Overlaid horizontal bar plot: the total occurrence of each hashtag is drawn first,
# then the occurrence in real tweets is drawn on top of it on the same axes.
f, ax = plt.subplots(figsize = (15, 8))
sns.barplot(x = 'total occurrence', y = 'hashtag', data = top10HashtagsPerVeracity,\
            label = 'Total hashtag occurrence', color = 'indigo', edgecolor = 'w')
sns.barplot(x = 'occurrence real tweets', y = 'hashtag', data = top10HashtagsPerVeracity,
            label = 'Real tweets hashtag occurrence', color = 'lightgreen', edgecolor = 'w')
ax.legend(ncol = 2, loc = 'lower right')
# Label spelling fixed ('Ocurrence' -> 'Occurrence') to match the other plots.
ax.set_xlabel('Occurrence', fontsize = 16)
ax.set_ylabel('Hashtag', fontsize = 16)
ax.set_title('Top 10 most used hashtags and their relationship with veracity', fontsize = 20)
plt.savefig("Top10HashtagsAndTheirVeracity.png")
plt.show()

In [None]:
locations = pd.read_csv('./ToChangeKeywordsAndLocations/worldcities.csv', encoding = 'latin-1')
tweets = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv')

In [None]:
# Lookup tables filled by applyCriteria. Every key is a normalised place name
# (lower-cased, surrounding dots and trailing whitespace removed).
cities = {}     # city name -> [(lat, lng), iso3]
countries = {}  # country name -> iso3
# Ambiguous city names: only the row whose country matches the value is accepted.
cityExceptions = {'London':'United Kingdom','Glasgow':'United Kingdom', 'Birmingham': 'United Kingdom', 'Rome':'Italy','Delhi':'India',\
                 'Paris':'France', 'Moscow':'Russia', 'Geneva':'Switzerland', 'Melbourne':'Australia','Manchester':'United Kingdom','Leicester':'United Kingdom'}
states = {}     # admin region name -> [(lat, lng), iso3]


def _placeKey(name):
    """Return the normalised form of a place name used as dictionary key."""
    return name.lower().strip('.').rstrip()


def applyCriteria(row):
    """Register one world-cities row in the `cities`, `countries` and `states` tables.

    Parameters
    ----------
    row : mapping
        Must provide 'city', 'country', 'lat', 'lng', 'iso3', 'capital'
        and 'admin_name'.

    Returns
    -------
    The row itself when it is rejected by `cityExceptions` (a listed city name
    coming from a different country); None otherwise.

    Notes
    -----
    The first row seen for a given city name wins. The previous duplicate check
    looked up the raw name while keys are stored normalised, so it never matched
    and later homonymous cities silently overwrote earlier ones.
    """
    expectedCountry = cityExceptions.get(row['city'], False)
    if expectedCountry and expectedCountry != row['country']:
        return row

    cityKey = _placeKey(row['city'])
    if cityKey not in cities:
        cities[cityKey] = [(row['lat'], row['lng']), row['iso3']]

    countries[_placeKey(row['country'])] = row['iso3']

    # Only administrative/primary capitals are used as the reference point of a state.
    if row['capital'] in ('admin', 'primary') and isinstance(row['admin_name'], str):
        states[_placeKey(row['admin_name'])] = [(row['lat'], row['lng']), row['iso3']]
locations.apply(applyCriteria, axis = 1)

In [None]:
# Common abbreviations mapped to the full (normalised) place name.
changePlace = {'ny': 'new york', 'la': 'los angeles', 'ca': 'california', 'tx': 'texas', 'us':'usa', 'nc': 'north carolina'}
def addNewData(row):
    """Add 'country' (and, when known, 'lat'/'long') to a tweet row from its location.

    The location is normalised the same way the lookup dictionaries are keyed
    (lower case, no surrounding dots/whitespace), expanded through `changePlace`,
    and then searched in `countries`, `states` and `cities`, in that order.
    A country match only sets 'country'; state and city matches also set the
    coordinates. Rows whose location is unknown or not a string are returned
    unchanged.

    Parameters
    ----------
    row : mapping with a 'location' entry; it is modified in place.

    Returns
    -------
    The same row object.
    """
    place = row['location']
    if isinstance(place, str):
        # Previously the raw text was used, so 'NY' or 'Texas.' never matched.
        place = place.strip().lower().strip('.').strip()
    place = changePlace.get(place, place)

    if place in countries:
        row['country'] = countries[place]
    elif place in states:
        (lat, lng), iso3 = states[place]
        row['country'] = iso3
        row['lat'] = lat
        row['long'] = lng
    elif place in cities:
        (lat, lng), iso3 = cities[place]
        row['country'] = iso3
        row['lat'] = lat
        row['long'] = lng
    return row
tweets = tweets.apply(addNewData, axis = 1, result_type= 'expand')
tweets.tail(5)

In [None]:
#Geo analysis
from shapely.geometry import Point
import geopandas as gpd

frames = [disastersDF, tweets[['country','lat','long']]]
disastersWorldDF = pd.concat(frames, axis = 1)

condition = ~(disastersWorldDF.keyword.isin(allNaturalDisasters))
natDisastWorldDF = disastersWorldDF.drop(disastersWorldDF[condition].index)#Creating a DF with the natural disasters only

#Droping rows with NaNs
natDisastWorldDF.dropna(inplace = True)

#Creating a new column with the coordinates
natDisastWorldDF['coordinates'] = list(zip(natDisastWorldDF['long'],natDisastWorldDF['lat']))
natDisastWorldDF['coordinates'] = natDisastWorldDF['coordinates'].apply(Point)
natDisastWorldDF.head()

In [None]:
#Some iso codes are integers
def fixISOCode(dataFrame):
    """Overwrite the 'iso_a3' value of three rows, addressed by index label, in place.

    NOTE(review): labels 43, 21 and 174 presumably are the rows of France,
    Norway and Kosovo in the `naturalearth_lowres` dataset -- confirm against
    the installed dataset version, since the labels are hard-coded.
    """
    isoByLabel = {43: 'FRA', 21: 'NOR', 174: 'RKS'}
    for label, isoCode in isoByLabel.items():
        dataFrame.loc[label, 'iso_a3'] = isoCode

In [None]:
trueNatDisast = natDisastWorldDF[natDisastWorldDF.target == 1]
falseNatDisast = natDisastWorldDF[natDisastWorldDF.target == 0]

#creating a geopandas data frame
trueNatDisast = gpd.GeoDataFrame(trueNatDisast, geometry='coordinates')
falseNatDisast = gpd.GeoDataFrame(falseNatDisast, geometry='coordinates')

world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) #World map
world = world[world.name != 'Antarctica']
fixISOCode(world)
ax = world.plot(color='lightgrey',edgecolor='black', figsize = (18,10)) #Setting colors

graf = trueNatDisast.plot(ax=ax, legend=True, marker='o', color= 'lime', markersize = 45)
graf = falseNatDisast.plot(ax=ax, legend=True, marker='x', color='red', markersize = 50)
graf.axes.set_title('Tweets about natural disasters over the world', fontsize = 18)
graf.legend(['True','False'], title = 'Tweet Veracity');

In [None]:
#Truth percentage per country
disastersWorldDF.dropna(subset = ['country'], inplace = True)
groupedCountry = disastersWorldDF.groupby(['country']).agg({'target':'sum','text':'count','hashtags':'sum','users_tagged':'sum','links':'sum'})
groupedCountry['truth_percentage'] = (groupedCountry.target / groupedCountry.text) * 100

#groupedCountry.rename(columns = {'target':'target_count', 'text':'text_count'}, inplace = True)
groupedCountry.reset_index(inplace=True)
groupedCountry.rename(columns = {'country':'iso_a3', 'target':'target_count', 'text':'text_count'}, inplace=True)

groupedCountry = groupedCountry[groupedCountry.text_count >= 5]

In [None]:
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world = world[world.name != 'Antarctica']
fixISOCode(world)
ax = world.plot(column = 'name',color='lightgrey',edgecolor='black', figsize = (18,10))
world = world.merge(groupedCountry) #Merging the data frame so we have the info

#Plot
graf = world.plot(ax = ax, column='truth_percentage', legend = True, cmap='Greens',\
                 legend_kwds={'label': 'Percentage of veracity(%)', 'orientation': 'horizontal','extend':'both','extendrect':True})
graf.axes.set_title('Percentage of veracity of tweets by country', fontsize = 18);

In [None]:
#USA geo analysis
#Function to find which state a point belongs to
def findState(coordinate, statesDF):
    """Return the name of the first state whose shape contains `coordinate`.

    Parameters
    ----------
    coordinate : object exposing `within(shape)`.
    statesDF : two-column frame; column 0 holds the state name and column 1
        the shape that is passed to `coordinate.within`.

    Returns
    -------
    The matching state name, or NaN when no row matches.
    """
    matches = (
        record[0]
        for record in statesDF.values
        if coordinate.within(record[1])
    )
    return next(matches, np.nan)

In [None]:
import geoplot as gplt
usaData = disastersWorldDF[disastersWorldDF.country == 'USA'].dropna()
usaData['coordinates'] = list(zip(usaData['long'],usaData['lat']))
usaData['coordinates'] = usaData['coordinates'].apply(Point)

contiguousUsa = gpd.read_file(gplt.datasets.get_path('contiguous_usa')) #USA map with contiguous states
usaData['state'] = usaData.coordinates.apply(findState, args = (contiguousUsa[['state','geometry']],))
usaData.dropna(inplace=True)

#Grouping per state
statesGrouped = usaData.groupby('state').agg({'text':'count', 'target':'sum', 'users_tagged':'sum', 'hashtags':'sum', 'links':'sum',\
                                             'tweet_length':['max','min','mean']})

#Renaming the labels
labels0 = statesGrouped.columns.get_level_values(0)
labels1 = statesGrouped.columns.get_level_values(1)
statesGrouped.columns = labels0 + '_' + labels1
statesGrouped['truth_percentage'] = (statesGrouped.target_sum / statesGrouped.text_count) * 100
statesGrouped = statesGrouped[statesGrouped.text_count >= 5]
statesGrouped.reset_index(inplace=True)
statesGrouped.head(5)

## Machine learning

In [None]:
#Imports
from numpy import linalg as LA
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.integrate import odeint
from sklearn.model_selection import train_test_split
#import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn import tree

In [None]:
def validLocation(location):
    """Return 0 when the location is the placeholder 'unknown', 1 for anything else."""
    return 0 if location == 'unknown' else 1

In [None]:
def sizes(size):
    """Encode a tweet-size label as a number: 'small' -> 0, 'medium' -> 5, anything else -> 10."""
    return 0 if size == 'small' else 5 if size == 'medium' else 10

In [None]:
disastersDF['Valid_location'] = disastersDF.location.apply(validLocation)
disastersDF['tweet_size'] = disastersDF.tweet_size.apply(sizes)
disastersDF.head()

In [None]:
disastersDF.target.value_counts()

In [None]:
disastersDF.shape

In [None]:
# Share of tweets labelled as real (target == 1), computed from the data
# instead of hard-coded counts so it stays correct if the frame changes.
trueTweetsPercentage = (disastersDF.target == 1).mean() * 100
falseTweetsPercentage = 100 - trueTweetsPercentage
trueTweetsPercentage, falseTweetsPercentage

In [None]:
X, y = disastersDF.iloc[:,5:], disastersDF.iloc[:,4]  #X tiene que tener todos los features distintos al target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

In [None]:
plt.bar(X_train.columns, rf_model.feature_importances_)
plt.xlabel('Features')
plt.ylabel('Importancia')
plt.title('Importancia Features con RF')
plt.xticks(rotation = 90, horizontalalignment = 'right')
plt.show()

## Word2vec

In [None]:
#!pip install gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim import models

sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],['this', 'is', 'the', 'second', 'sentence'],['yet', 'another', 'sentence'],['one', 'more', 'sentence'],['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# access vector for one word
print(model['sentence'])
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

In [None]:
sentence = [['hello', 'whats', 'up']]
model = Word2Vec(sentence, min_count = 1)
model

In [None]:
tweetsText = disastersDF.text.to_list()
sentences = [text.split() for text in tweetsText]
model = Word2Vec(sentences, min_count = 25)
print(model)
words = list(model.wv.vocab)
print(words)

In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)

In [None]:
trueTweets = disastersDF[disastersDF.target == 1]
falseTweets = disastersDF[disastersDF.target == 0]

In [None]:
falseTexts = falseTweets.text.to_list()
trueTexts = trueTweets.text.to_list()
falseSentences = [text.split() for text in falseTexts]
trueSentences = [text.split() for text in trueTexts]
falseModel = Word2Vec(falseSentences, min_count = 25)
trueModel = Word2Vec(trueSentences, min_count = 25)
print(falseModel)
words = list(falseModel.wv.vocab)
print(words)

In [None]:
print(trueModel)
words = list(trueModel.wv.vocab)
print(words)

In [None]:
X = trueModel[trueModel.wv.vocab]
Y = falseModel[falseModel.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
result2 = pca.fit_transform(Y)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1], c = 'g')
pyplot.scatter(result2[:, 0], result2[:, 1], c = 'r');

In [None]:
disastersDF['Total_elements'] = disastersDF.hashtags + disastersDF.users_tagged + disastersDF.links
disastersDF['links_hash'] = disastersDF.hashtags + disastersDF.links
disastersDF['links_users'] = disastersDF.users_tagged + disastersDF.links
disastersDF['hash_users'] = disastersDF.hashtags + disastersDF.users_tagged
train = disastersDF.iloc[:, 4:]
train

In [None]:
X, y = train.iloc[:,1:], train.iloc[:,0]  #X tiene que tener todos los features distintos al target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
rf_model = RandomForestRegressor(random_state=15, n_estimators=1)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

In [None]:
plt.bar(X_train.columns, rf_model.feature_importances_)
plt.xlabel('Features')
plt.ylabel('Importancia')
plt.title('Importancia Features con RF')
plt.xticks(rotation = 90, horizontalalignment = 'right')
plt.show()

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split = 10)
y_pred = clf.fit(X_train, y_train).predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

In [None]:
falseTexts = falseTweets.text.to_list()
trueTexts = trueTweets.text.to_list()
falseSentences = [text.split() for text in falseTexts]
trueSentences = [text.split() for text in trueTexts]
falseModel = Word2Vec(falseSentences, min_count = 1)
trueModel = Word2Vec(trueSentences, min_count = 1)
print(falseModel)
words = list(falseModel.wv.vocab)
print(words)

In [None]:
words = list(trueModel.wv.vocab)
print(words)

In [None]:
'fucking' in trueModel.wv.vocab #Cosas porno, saludos, xoxo

In [None]:
'fucking' in falseModel.wv.vocab

In [None]:
def roughWords(words):
    """Flag a text that contains coarse/informal vocabulary.

    The text is split on whitespace and every token is lower-cased. The result
    is 1 when any token is in the vocabulary below or contains more than one
    '?', and 0 otherwise. Tokens are compared whole, so punctuation attached
    to a word prevents a vocabulary match.
    """
    vocabulary = {
        'sex', 'sexy', 'cunt', 'dick', 'cock', 'xxx', 'porn',
        'lesbian', 'gay', 'masturbation', 'fap', 'asshole',
        'assholes', 'suck', 'sucker', 'idiot', 'stupid', 'cum',
        'blowjob', 'bitch', 'slut', 'sluts', 'whores', 'bitches', 'whore',
        'cunts', 'suckers', 'ass', 'butt', 'nude', 'nudes', 'naked', 'fucking',
        'xoxo', 'cocks', 'dicks', 'wtf', 'lol', 'lmfao', 'lmao', 'jerkface',
    }
    tokens = (raw.lower() for raw in words.split())
    flagged = any(token in vocabulary or token.count('?') > 1 for token in tokens)
    return int(flagged)

In [None]:
disastersDF['Rough_words'] = disastersDF.text.apply(roughWords)
disastersDF.Rough_words.value_counts()

In [None]:
train = disastersDF.iloc[:, 4:]
X, y = train.iloc[:,1:], train.iloc[:,0]  #X tiene que tener todos los features distintos al target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.4, train_size = 0.6, random_state=123)

In [None]:
rf_model = RandomForestRegressor(random_state=5, n_estimators=1)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

In [None]:
plt.bar(X_train.columns, rf_model.feature_importances_)
plt.xlabel('Features')
plt.ylabel('Importancia')
plt.title('Importancia Features con RF')
plt.xticks(rotation = 90, horizontalalignment = 'right')
plt.show()

In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split = 10)
y_pred = clf.fit(X_train, y_train).predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

In [None]:
#Dependiendo del split que haga para el train mejora la prediccion

In [None]:
y_train.value_counts()

## Parte posta

In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm
#nltk.download('wordnet')
#nltk.download('punkt')
import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
import spacy
import en_core_web_sm


In [None]:
#Features a agregar: ver si mencionan (no en forma de etiqueta) a empresas, personas, ciudades, paises
#O sea usar NER
#Aplicar todo lo escrito salvo la parte de lower para no confundir al algoritmo de NER
#Una vez que aplicas todo eso y sacas los features ahi pasas a lo del vocabulario, word2vec, etc

In [None]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

ne_tree = nltk.ne_chunk(pos_tag(nltk.word_tokenize(ex)))


In [None]:
nlp = en_core_web_sm.load() #Hay que instalarlo, ver link

doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
print([(X.text, X.label_) for X in doc.ents])

In [None]:
def analizeText():
    """Placeholder for a text-analysis step; not implemented, it does nothing and returns None."""
    return None

In [None]:
text=" Welcome readers. I hope you find it interesting. Please do reply."
from nltk.tokenize import sent_tokenize
sent_tokenize(text)

In [None]:
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
text=" Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"
tokenizer.tokenize(text) #Splitea por puntuacion (.)

In [None]:
text=nltk.word_tokenize("PierreVinken, 59 years old, will join as a nonexecutive director on Nov. 29 .")
text

In [None]:
import re
import string
text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."]

tokenized_docs=[nltk.word_tokenize(doc) for doc in text]
x=re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = x.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))
words=["Don't", 'hesitate','to','ask','questions']
[word for word in words if word.lower() not in stops]

In [None]:
import re
# (regex, replacement) pairs expanding English contractions. Specific forms come
# first; the generic suffix rules follow. Replacement strings are raw strings:
# '\g' is not a valid Python string escape, so the non-raw form triggers an
# invalid-escape warning on recent Python versions.
replacement_patterns = [
(r'won\'t', 'will not'),
(r'can\'t', 'cannot'),
(r'i\'m', 'i am'),
(r'I\'m', 'I am'),
(r'ain\'t', 'is not'),
(r'(\w+)\'ll', r'\g<1> will'),
(r'(\w+)n\'t', r'\g<1> not'),
(r'(\w+)\'ve', r'\g<1> have'),
(r'(\w+)\'s', r'\g<1> is'),
(r'(\w+)\'re', r'\g<1> are'),
(r'(\w+)\'d', r'\g<1> would')
]
class RegexpReplacer(object):
    """Expand contractions in a text by applying a list of regex substitutions.

    NOTE: the patterns are case-sensitive, so a capitalised "Won't" is not
    caught by the first rule and is rewritten by the generic "n't" rule instead.
    """

    def __init__(self, patterns=replacement_patterns):
        """Compile every (regex, replacement) pair; `patterns` is only read, never modified."""
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return `text` after applying every substitution, in list order."""
        s = text
        for (pattern, repl) in self.patterns:
            s = pattern.sub(repl, s)
        return s

In [None]:
replacer= RegexpReplacer()
replacer.replace("Don't hesitate to ask questions")

#replacer.replace("She must've gone to the market but she didn't go")

In [None]:
from nltk.corpus import wordnet
class RepeatReplacer(object):
    """Collapse repeated characters in a word until WordNet recognises it."""

    def __init__(self):
        # Matches one doubled character anywhere in the word; the replacement
        # keeps a single copy of it.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return `word` with repeated characters removed one pair at a time.

        Stops as soon as WordNet has at least one synset for the current form,
        or when no repeated character is left to remove.
        """
        current = word
        while not wordnet.synsets(current):
            reduced = self.repeat_regexp.sub(self.repl, current)
            if reduced == current:
                # Nothing left to collapse.
                return reduced
            current = reduced
        return current

In [None]:
wordnet.langs()

In [None]:
replacer=RepeatReplacer()


In [None]:
#Bueno hasta aca tenes todas las herramientas, time to replace the text

In [None]:
def cleanTweet(tweet): # Modified to strip links only
    """Return `tweet` without links and with whitespace runs collapsed to one space.

    A link is any `scheme://...` token up to the next whitespace. The pattern
    is a raw string: the previous non-raw literal relied on invalid escape
    sequences (`\w`, `\/`, `\S`).
    """
    return ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", tweet).split())

In [None]:
import string
def deletePunctuation(tokenizedText):
    """Strip ASCII punctuation from every token and drop tokens that end up empty.

    Parameters
    ----------
    tokenizedText : iterable of str

    Returns
    -------
    list of str, in the original order.
    """
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = (punctuation.sub(u'', token) for token in tokenizedText)
    return [token for token in stripped if token != u'']

In [None]:
def deleteStopwords(tokenizedText, stopwords):
    """Return the tokens of `tokenizedText` that are not in `stopwords`, keeping their order."""
    return list(filter(lambda token: token not in stopwords, tokenizedText))

In [None]:
def editText(text, stopwords, replacer, repeatReplacer):
    """Clean one tweet and return it as a space-joined string of tokens.

    Steps, in order: remove links, lower-case, expand contractions with
    `replacer`, collapse repeated characters word by word with
    `repeatReplacer`, tokenize, strip punctuation and drop stopwords.

    Parameters
    ----------
    text : str
    stopwords : container of words to drop.
    replacer, repeatReplacer : objects exposing `replace(str) -> str`.
    """
    # Links go first, then lower-casing, then contractions (e.g. "i'm" -> "i am").
    expanded = replacer.replace(cleanTweet(text).lower())

    # Repeated characters are collapsed per whitespace-separated word, e.g. "ohhh" -> "oh".
    collapsed = ' '.join(repeatReplacer.replace(piece) for piece in expanded.split())

    tokens = nltk.word_tokenize(collapsed)
    tokens = deleteStopwords(deletePunctuation(tokens), stopwords)
    return ' '.join(tokens)

In [None]:
repeatReplacer = RepeatReplacer()
replacer = RegexpReplacer()
stop = set(stopwords.words('english'))

disastersDF['text'] = disastersDF.text.apply(editText, args = (stop, replacer, repeatReplacer))
disastersDF

In [None]:
disastersDF.loc[0].text

In [None]:
naturalDisastersDF.loc[0].text

In [None]:
stopsasdasdasd

## Regresión Logística

https://www.aprendemachinelearning.com/regresion-logistica-con-python-paso-a-paso/

In [None]:
#import libraries
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
bestFeatures = ['168',
 '70',
 '119',
 '296',
 '198',
 '66',
 '227',
 '16',
 '68',
 '183',
 '256',
 '95',
 '54',
 '114',
 '36',
 '149',
 '293',
 '216',
 '50',
 '234',
 '127',
 '184',
 '156',
 '298',
 '189',
 '77',
 '165',
 '87',
 '135',
 '158',
 '34',
 '6',
 '110',
 '151',
 '154',
 '74',
 '137',
 '155',
 '166',
 '237',
 '231',
 '211',
 '277',
 '81',
 '27',
 '94',
 '92',
 '254',
 '241',
 '239',
 '285',
 '57',
 'amount_of_words_proportion',
 '191',
 '31',
 '163',
 '142',
 '104',
 '21',
 '262',
 '291',
 'tweet_length',
 '208',
 '1',
 '25',
 '116',
 '230',
 '152',
 'Total_elements',
 '182',
 '121',
 '7',
 '41',
 '80',
 'links',
 '171',
 '55',
 '28',
 '282',
 '150',
 'Natural_disaster',
 '103',
 '272',
 '69',
 '214',
 '280',
 '258',
 '130',
 '120',
 '249',
 '52',
 '247',
 '270',
 '238',
 '260',
 '43',
 '228',
 '86',
 '264',
 '200',
 '111',
 '157',
 '212',
 '4',
 '159',
 '51',
 '30',
 '12',
 '2',
 '39',
 '179',
 '278',
 '284',
 '84',
 '14',
 '186',
 '125',
 '63',
 '117',
 '273',
 '220',
 '287',
 '153',
 '99',
 '78',
 '265',
 '288',
 '267',
 '180',
 '29',
 '102',
 '139',
 '131',
 '274',
 '98',
 '38',
 '173',
 '62',
 '10',
 '160',
 '259',
 '164',
 '82',
 '206',
 '0',
 '275',
 '181',
 '204',
 '13',
 '118',
 '133',
 '93',
 '33',
 '129',
 '207',
 '266',
 '48',
 '172',
 '290',
 '40',
 '148',
 '185',
 '271',
 '85',
 '268',
 '146',
 '56',
 '217',
 '101',
 '91',
 '64',
 '23',
 '187',
 '32',
 '195',
 '140',
 '124',
 '177',
 '141',
 '276',
 '128',
 '219',
 '47',
 '245',
 '108',
 '261',
 '46',
 '88',
 '162',
 '294',
 '37',
 '235',
 '123',
 '35',
 '178',
 '58',
 '174',
 '76',
 '202',
 '109',
 '295',
 '205',
 '225',
 '136',
 '255',
 '281',
 '213',
 '242',
 '190',
 '229',
 '232',
 '233',
 '17',
 '167',
 '122',
 '65',
 '223',
 '26',
 '236',
 '297',
 '203',
 '289',
 '126',
 '222',
 '218',
 '253',
 '246',
 '90',
 '286',
 '113',
 '292',
 '176',
 '45',
 '73',
 '263',
 '106',
 '3',
 '221',
 '196',
 '115',
 '161',
 '145',
 '251',
 '147',
 '107',
 '252',
 '96',
 '61',
 '112',
 '59',
 '192',
 '283',
 '210',
 '132',
 '44',
 '209',
 '279',
 '22',
 '188',
 '226',
 '42',
 '11',
 '79',
 '143',
 '75',
 '53',
 '193',
 '169',
 '248']

In [None]:
trainCSV = pd.read_csv('./forHiper')

In [None]:
# Use the feature list defined above (`bestFeatures`); `top_feat` is not defined
# in this section. The target is read without `pop`, so `trainCSV` is left intact
# and the cell can be re-run.
x = trainCSV.loc[:, bestFeatures]
y = trainCSV['target']

In [None]:
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(x, y, test_size=0.2, random_state=7)

In [None]:
model = linear_model.LogisticRegression()
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))

## Gaussian process classification

https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpc.html

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [None]:
# Fit a Gaussian-process classifier with a scaled RBF kernel on the training split.
gpc = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),  n_jobs = -1).fit(X_train, Y_train)

# The previous format string expected two values but received one (TypeError).
# Both the training and the validation accuracy are reported now.
print("Accuracy: %.3f (train) %.3f (validation)"
      % (accuracy_score(Y_train, gpc.predict(X_train)),
         accuracy_score(Y_validation, gpc.predict(X_validation))))

In [2]:
## Neural Networks with Tensorflow

In [39]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from tensorflow.keras.models import Sequential
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.layers import Dropout

In [14]:
bestFeatures = ['168',
 '70',
 '119',
 '296',
 '198',
 '66',
 '227',
 '16',
 '68',
 '183',
 '256',
 '95',
 '54',
 '114',
 '36',
 '149',
 '293',
 '216',
 '50',
 '234',
 '127',
 '184',
 '156',
 '298',
 '189',
 '77',
 '165',
 '87',
 '135',
 '158',
 '34',
 '6',
 '110',
 '151',
 '154',
 '74',
 '137',
 '155',
 '166',
 '237',
 '231',
 '211',
 '277',
 '81',
 '27',
 '94',
 '92',
 '254',
 '241',
 '239',
 '285',
 '57',
 'amount_of_words_proportion',
 '191',
 '31',
 '163',
 '142',
 '104',
 '21',
 '262',
 '291',
 'tweet_length',
 '208',
 '1',
 '25',
 '116',
 '230',
 '152',
 'Total_elements',
 '182',
 '121',
 '7',
 '41',
 '80',
 'links',
 '171',
 '55',
 '28',
 '282',
 '150',
 'Natural_disaster',
 '103',
 '272',
 '69',
 '214',
 '280',
 '258',
 '130',
 '120',
 '249',
 '52',
 '247',
 '270',
 '238',
 '260',
 '43',
 '228',
 '86',
 '264',
 '200',
 '111',
 '157',
 '212',
 '4',
 '159',
 '51',
 '30',
 '12',
 '2',
 '39',
 '179',
 '278',
 '284',
 '84',
 '14',
 '186',
 '125',
 '63',
 '117',
 '273',
 '220',
 '287',
 '153',
 '99',
 '78',
 '265',
 '288',
 '267',
 '180',
 '29',
 '102',
 '139',
 '131',
 '274',
 '98',
 '38',
 '173',
 '62',
 '10',
 '160',
 '259',
 '164',
 '82',
 '206',
 '0',
 '275',
 '181',
 '204',
 '13',
 '118',
 '133',
 '93',
 '33',
 '129',
 '207',
 '266',
 '48',
 '172',
 '290',
 '40',
 '148',
 '185',
 '271',
 '85',
 '268',
 '146',
 '56',
 '217',
 '101',
 '91',
 '64',
 '23',
 '187',
 '32',
 '195',
 '140',
 '124',
 '177',
 '141',
 '276',
 '128',
 '219',
 '47',
 '245',
 '108',
 '261',
 '46',
 '88',
 '162',
 '294',
 '37',
 '235',
 '123',
 '35',
 '178',
 '58',
 '174',
 '76',
 '202',
 '109',
 '295',
 '205',
 '225',
 '136',
 '255',
 '281',
 '213',
 '242',
 '190',
 '229',
 '232',
 '233',
 '17',
 '167',
 '122',
 '65',
 '223',
 '26',
 '236',
 '297',
 '203',
 '289',
 '126',
 '222',
 '218',
 '253',
 '246',
 '90',
 '286',
 '113',
 '292',
 '176',
 '45',
 '73',
 '263',
 '106',
 '3',
 '221',
 '196',
 '115',
 '161',
 '145',
 '251',
 '147',
 '107',
 '252',
 '96',
 '61',
 '112',
 '59',
 '192',
 '283',
 '210',
 '132',
 '44',
 '209',
 '279',
 '22',
 '188',
 '226',
 '42',
 '11',
 '79',
 '143',
 '75',
 '53',
 '193',
 '169',
 '248']

In [124]:
trainCSV = pd.read_csv('./forHiper')

In [125]:
# x = trainCSV.loc[:, bestFeatures]
# Features: every column except the identifiers, the raw text fields and the target.
x = trainCSV.drop(['id', 'text', 'keyword', 'location','target'], axis=1)
# Read the target without `pop`: popping removed the column from `trainCSV`,
# so running this cell a second time raised a KeyError.
y = trainCSV['target']

In [126]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.1, random_state=42)

In [94]:
X_train

Unnamed: 0,users_tagged,hashtags,links,tweet_length,tweet_size,Valid_location,Total_elements,links_hash,links_users,hash_users,...,291,292,293,294,295,296,297,298,299,keywordAppearance
4996,0,1,1,140,10,1,2,2,1,1,...,0.361572,-0.425049,1.316772,-1.566833,-0.566895,-0.620117,-0.859009,-0.622070,0.263733,34
3263,2,1,0,98,5,0,3,1,2,3,...,0.340088,-0.977493,0.245422,-0.150024,-0.044067,-0.655029,-0.664185,-0.630859,0.565063,36
4907,1,0,1,143,10,0,2,1,2,1,...,-0.100586,-1.276062,0.506836,-0.496307,0.138550,-0.776489,-0.514954,0.574730,0.110199,36
2855,0,0,1,118,10,0,1,1,1,0,...,0.595459,0.007629,0.329071,1.373535,-0.173340,-0.844604,0.178711,1.080078,-0.392517,35
4716,3,1,0,82,5,1,4,1,3,4,...,0.117432,-0.892090,0.065308,-0.734375,-0.068359,-0.593689,-0.187317,-0.264648,0.775635,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,1,0,0,96,5,0,1,0,1,1,...,0.395630,-0.271973,0.481812,-0.196777,0.027946,0.325317,-0.498291,0.054932,-0.027115,29
5390,0,0,0,132,10,0,0,0,0,0,...,0.561401,-0.717865,0.884277,-0.738220,-0.066711,-0.205322,-0.435562,0.825165,-0.367126,37
860,0,0,2,121,10,0,2,2,2,0,...,0.221887,-0.102051,0.195831,0.305969,0.518250,0.338684,-0.046021,-0.126175,0.011230,35
7603,0,0,1,136,10,0,1,1,1,0,...,0.219482,-0.115479,0.176392,0.558594,0.356445,-1.221680,-0.206444,0.983330,0.339111,0


In [None]:
def build_model(optimizer):
  """Build and compile a 32-64-1 dense binary classifier with 10% dropout after each hidden layer.

  `optimizer` is forwarded to `compile`; the input width is taken from the
  global `X_train`.
  """
  layers = [
      Dense(32, input_shape=(X_train.shape[1],), activation='relu'),
      Dropout(0.1),
      Dense(64, activation='relu'),
      Dropout(0.1),
      Dense(1, activation='sigmoid'),
  ]
  network = Sequential(layers)
  network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])
  return network

In [41]:
# Grid explored by the search: batch size, number of epochs and optimizer.
parameters = {
    'batch_size': [8, 16, 32],
    'epochs': [100, 200, 500],
    'optimizer': ['adadelta', 'rmsprop', 'adam'],
}

In [42]:
estimator = KerasClassifier(build_fn=build_model, verbose=0)
grid_search = GridSearchCV(estimator=estimator, param_grid=parameters, scoring='accuracy', cv=10)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'batch_size': 8, 'epochs': 500, 'optimizer': 'adadelta'}

In [134]:
#Capas
def build_model(l1, l2):
  """Build and compile a dense binary classifier with two hidden layers of `l1` and `l2` units.

  No dropout is used in this variant; the optimizer is fixed to 'adadelta'
  and the input width is taken from the global `X_train`.
  """
  layers = [
      Dense(l1, input_shape=(X_train.shape[1],), activation='relu'),
      Dense(l2, activation='relu'),
      Dense(1, activation='sigmoid'),
  ]
  network = Sequential(layers)
  network.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])
  return network

# Grid explored by the search: sizes of the first and second hidden layers.
parameters = {
    'l1': [16, 32, 64, 128, 256],
    'l2': [16, 32, 64, 128],
}

estimator = KerasClassifier(build_fn=build_model, verbose=0, batch_size=16, epochs=100)
grid_search = GridSearchCV(estimator=estimator, param_grid=parameters, scoring='accuracy', cv=10)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'l1': 256, 'l2': 128}

In [136]:
#Dropouts
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dropout
from sklearn.model_selection import GridSearchCV
def build_model(d1, d2):
    """Build and compile the binary classifier used by the dropout grid search.

    Architecture: Dense(256, relu) -> Dropout(d1) -> Dense(32, relu) ->
    Dropout(d2) -> Dense(1, sigmoid).

    Args:
        d1: dropout rate applied after the first hidden layer.
        d2: dropout rate applied after the second hidden layer.

    Returns:
        The ``Sequential`` model compiled with binary cross-entropy loss, the
        'adadelta' optimizer and the 'acc' metric.

    Note:
        The input width is read from the notebook-level ``X_train``, so that
        variable must already exist when this function is called.
    """
    n_features = X_train.shape[1]
    # NOTE(review): the second hidden layer has 32 units, whereas the
    # layer-size search earlier in the notebook reported l2=128 -- confirm
    # that 32 is intended.
    network = Sequential([
        Dense(256, input_shape=(n_features,), activation='relu'),
        Dropout(d1),
        Dense(32, activation='relu'),
        Dropout(d2),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['acc'])
    return network

# Candidate dropout rates after the first (d1) and second (d2) hidden layers.
parameters = {
    'd1': [0.01, 0.1, 0.2, 0.25],
    'd2': [0.01, 0.1, 0.2, 0.25],
}

# Batch size and epoch count are fixed here; only the dropout rates in
# `parameters` vary. Scored by accuracy with 10-fold cross-validation.
estimator = KerasClassifier(
    build_fn=build_model,
    verbose=0,
    batch_size=16,
    epochs=200,
)
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
grid_search.fit(X_train, y_train)
# Last expression of the cell: displays the best dropout rates found.
grid_search.best_params_

{'d1': 0.01, 'd2': 0.01}

In [155]:
# (64, 16), (64, 32), (128, 16) work well
# (presumably hidden-layer sizes that were also tried -- confirm).
# Final model: 256 -> 128 hidden units with dropout rates 0.2 and 0.01.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.01),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])

In [156]:
# Print the layers, output shapes and parameter counts of the final model.
model.summary()

Model: "sequential_843"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2519 (Dense)           (None, 256)               93952     
_________________________________________________________________
dropout_1241 (Dropout)       (None, 256)               0         
_________________________________________________________________
dense_2520 (Dense)           (None, 128)               32896     
_________________________________________________________________
dropout_1242 (Dropout)       (None, 128)               0         
_________________________________________________________________
dense_2521 (Dense)           (None, 1)                 129       
Total params: 126,977
Trainable params: 126,977
Non-trainable params: 0
_________________________________________________________________


In [157]:
# Train the final model for 500 epochs with mini-batches of 20 samples.
# The History object returned by fit is the cell's last expression, so it is
# echoed after the per-epoch log.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=20,
    epochs=500,
)

step - loss: 0.5045 - accuracy: 0.7656
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376

<tensorflow.python.keras.callbacks.History at 0x1a6a7610d0>

In [158]:
# Evaluate the trained model on the test data; `results` holds the values
# returned by evaluate, printed below as loss followed by accuracy.
results = model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=128,
)
print("test loss, test acc:", results)

test loss, test acc: [0.42617130279541016, 0.8293963074684143]


In [None]:
# Dropout rates (0.2, 0.01) -> test accuracy 0.8293963074684143
# Predict on the first three test rows as a quick check of the output shape.
# Uses X_test (capital X), the name the evaluation cell uses for the test
# features; the lowercase x_test does not appear elsewhere in the visible
# cells and would most likely raise a NameError.
predictions = model.predict(X_test[:3])
print("predictions shape:", predictions.shape)