In [7]:
import plotly
import plotly.plotly as py 
import plotly.graph_objs as go 
import pandas as pd 
import time

# some more libraries to plot graphs
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot 
import matplotlib.pyplot as plt

# some libraries to process images
import numpy as np
import cv2

# some more libraries that we will use
import pandas as pd # For storing and manipulating data
from wordcloud import WordCloud # For cleaning text of stopwords and creating word cloud
import re # For cleaning text of symbols and numbers
from matplotlib.cm import ScalarMappable # For color mapping 
from matplotlib.colors import Normalize # For transforming freq values between 0 and 1
from mpl_toolkits.axes_grid1 import make_axes_locatable # For formatting colorbar

# some data for referencing
from list_us_states import states

In [8]:
class USWordCloud():
    '''
    get the word_cloud for text data for a US state and insert it into the us map with this class.

    Methods:
    - getStateMasks: get the small and large mask for a state
    - storeStateAndTextData: store the state string and list of text data entries
    - getLongString: get the long string of text from a column in a data frame 

    inputs:
    - __init__():
    - - scale_factor -> int: (1, 2, or 3) -> the size of the final image (small, medium, or large)
    - - filename -> str -> the name of the final image
    '''
    
    # initialize instance variables
    def __init__(self,scale_factor,filename):
        
        # Initialize instance attributes
        assert type(scale_factor) == int, "scale_factor input must be an integer"
        assert (scale_factor == 1)|(scale_factor == 2)|(scale_factor == 3), "scale_factor input must have values 1, 2, or 3!"
        self.scale_factor = scale_factor # store the integer scale factor as 1,2, or 3 for a small, medium, or large final image
        
        assert type(filename) == str, "filename input must be a string"
        self.filename = filename # store the string filename for the final image
        
        # Initialize other data
        self.text_data = []
        self.fig = []
        self.state = 'NY'
        self.scale = 5
        self.large_scale = self.scale*self.scale_factor
        self.US_wc_path = 'us_word_clouds/'+self.filename+'.png'
        self.US_map_path = 'us_map/us_map.png'
        self.state_wc_path = 'word_cloud/state_word_cloud.png'
        self.mask_path = ''
        
        # make the US map
        self.__makeUSMapFigure()
        
        # Save the US fig for wc
        self.__saveUSMapFigure(image_scale=self.large_scale, path=self.US_wc_path)
        
        # Save the US map fig for ref
        self.__saveUSMapFigure(image_scale=self.large_scale, path=self.US_map_path)

    
    def execute(self,state,text):
        '''
        run the sequence of commands that will build the word cloud
        input: 
        - state: str -> capital,2-letter state name like ('NY','CA',...) 
        - text: str -> a list of text gathered for the state
        '''
        self.state = state
        self.text_data = text
        
        # check the text data and state
        self.__checkStateAndTextData()
        
        # filter the data
        self.__getLongString()
        self.__filterSymbols()
        
        # get the small mask
        self.__getMask(mask_scale = self.scale)
        
        # build the word cloud for the state
        self.__buildStateWC()
                
        # build the large mask
        self.__getMask(mask_scale = self.large_scale)
        
        # insert the word cloud into the US map
        self.__insertStateWC()
        
    def __checkStateAndTextData(self,):
        '''
        store the state and a list of text
        '''
        assert self.state in states, "the input for state has to be a capital 2-letter US state acryonym"
        
        assert type(self.text_data) == list, "The input for text needs to be a list of strings"
        for text in self.text_data:
            assert type(text)==str, "the input for text_data needs to be list containing only strings"

        
    
    
    def __makeUSMapFigure(self):
        '''
        make a figure of the US Map using plotly 
        '''
        # To establish connection 
        init_notebook_mode(connected = True) 

        # type defined is choropleth to 
        # plot geographical plots 
        data = dict(type = 'choropleth', 
                    # location: Arizoana, California, Newyork 
                    locations = [self.state], 
                    # States of USA 
                    locationmode = 'USA-states', 
                    # Hide Colorbar
                    showscale=False,
                    # colorscale can be added as per requirement 
                    colorscale = 'greys',                 
                    # choose the color to fill the state
                    z = [0])

        # Adjust layout of image
        layout = go.Layout(geo={'scope': 'usa'},
                            margin=go.layout.Margin(
                                l=0,
                                r=0,
                                b=0,
                                t=0,
                                pad=0))

        # pass data dictionary as a list  
        choromap = go.Figure(data = [data], layout = layout) 

        self.fig =choromap
        
    def __saveUSMapFigure(self,image_scale,path):
        '''
        save the US map figure
        
        input: 
        - image_scale: int -> the multiple that will increase the size of the base figure
        - path: str -> the path to the file
        
        '''
        # Save figures to file
        plotly.io.write_image(self.fig, 
                              path, 
                              format='png',
                              # Adjust the image dimensions
                              scale=image_scale, 
                              validate=False)
        
    def __getLongString(self):
        '''
        transform a list of text into a long string of space-separated,
        lowercase words.
        '''
        # we need an empty string to store all of the words
        long_string = ' '

        # iterate through each row of the job title column 
        for value in self.text_data: 

            # change each value to string 
            value = str(value) 

            # split the job titles into a list separated by spaces  
            sub_words = value.split() 

            # i. converts each sub-word from the job title into lowercase 
            sub_words = [value.lower() for value in sub_words]

            for word in sub_words: 

                # ii. append each sub-word to the long string
                long_string = long_string + word

                # iii. separate each word by a space
                long_string = long_string + ' ' 
        
        self.text_data = long_string

    def __filterSymbols(self):
        '''
        filter out symbols and numbers from a long string
        '''
        # filter out the symbols from the long string with the re.sub method
        long_string_no_sym_num = re.sub('[^a-zA-Z+#0234]', # keep a-z, A-Z, #, +, 2-4, for c#, c++, d3, h20, neo4 
                                        ' ', # replace anything else with a space
                                        self.text_data) # filter long_string 
        
        self.text_data = long_string_no_sym_num
        
    def __getMask(self,mask_scale):
        '''
        get a certain size mask for a state

        input: 
        - mask_scale: int -> the scale of the mask being made
        - path_to_file: str -> the path to the filename (path/to/file)
        '''
        
        # store US fig
        self.__makeUSMapFigure()
        
        # assign mask path
        self.mask_path='masks/'+str(mask_scale)+'x_mask.png'
        
        # save the US map fig
        self.__saveUSMapFigure(image_scale=mask_scale,
                               path=self.US_map_path)
        
        # import image as greyscale
        img = cv2.imread(self.US_map_path,  0)

        # use THRESH_TOZERO_INV to set values greater than 190 to zero
        val = 190
        val, thresh_img = cv2.threshold(img,val,255,cv2.THRESH_TOZERO_INV)

        # use THRESH_TOZERO to set values less than or equal to 189 to zero
        val = 189
        val, thresh_thresh_img = cv2.threshold(thresh_img,val,255,cv2.THRESH_TOZERO)

        # use THRESH_BINARY to set values greater than 189 to 255
        val = 189
        val, thresh_thresh_thresh_img = cv2.threshold(thresh_thresh_img,val,255,cv2.THRESH_BINARY)
        
        # Store the 1-channel mask
        mask_1chan = thresh_thresh_thresh_img

        # Import 3-channel version of image
        img_3chan = cv2.imread(self.mask_path,1)

        # Make empty array with dimensions of 3-channel image
        full_img = np.full(img_3chan.shape,255,dtype='uint8')

        # This bitwise_or function combines the two white images in all 3 channals only in
        # the area where the mask is 255
        mask_3chan = cv2.bitwise_or(full_img,full_img,mask=mask_1chan)

        # Save state mask to file
        cv2.imwrite(self.mask_path,mask_3chan)

    def __buildStateWC(self):
        '''
        get the word cloud for the text data from a state in the shape of the state
        '''
        
        # filter out the stopwords from the long string and get the word/word-freq dictionary
        words_and_freqs_dict = WordCloud().process_text(self.text_data)

        # store the mask image as greyscale
        img = cv2.imread('masks/'+str(self.scale)+'x_mask_'+self.filename+'.png',0)

        # invert the mask image for wc input
        img = cv2.bitwise_not(img)

        ## WC takes mask as 2D numpy array
        state_mask = np.array(img)

        # create the word cloud
        wc = WordCloud(background_color="black",
                       mask=state_mask,
                       contour_width=5,
                       colormap='gist_rainbow',
                       relative_scaling = .5,
                       scale=self.scale_factor,
                       min_font_size=2,
                       contour_color='steelblue')

        # generate word cloud
        wc.fit_words(words_and_freqs_dict)

        # Save Wordcloud as image 
        wc.to_file(self.state_wc_path)
        # Print a success statement when wc is made successfully
        print('Word cloud was made successfully for: '+str(self.state))
            
    def __insertStateWC(self):
        '''
        get the word cloud from a state into the map of the whole US
        '''

        # import full scale US image
        US_img = cv2.imread(self.US_wc_path,1)

        # import the full scale state mask
        state_mask = cv2.imread('masks/'+str(self.large_scale)+'x_mask_'+self.filename+'.png',0)

        # Get the inv
        state_mask_inv = cv2.bitwise_not(state_mask)

        # Use bitwise_OR
        cut_US_img = cv2.bitwise_or(US_img,US_img,mask = state_mask_inv) 

        # import the large scale state word cloud 
        state_wc = cv2.imread(self.state_wc_path,-1)

        # use bitwise OR on state_wc with state_mask_inv
        cut_state_wc = cv2.bitwise_or(state_wc,state_wc,mask = state_mask)

        # use bitwise AND on cut_CA_wc with cut_US_img
        US_wc = cv2.bitwise_or(cut_state_wc,cut_US_img)

        # write the modified full scale us map image
        cv2.imwrite(self.US_wc_path,US_wc)

        print('the word cloud was successfully added for the state: '+str(self.state))




In [9]:
endriCloud = USWordCloud(1,'endriCloud')

In [11]:
for state in states:
    text = ['sports','bars','I like to go to the store and buy applesauce','how many times do I have to keep telling you',
        'I just can not get over the awesome beer','drunk drunk','i like to drink','my name is endri and i drink beer',
        'yo where are you ladies at','gimme some more of those hot ladies','I like when you talk dirty to me',
        'ask me the square root of my penis','the square root','rub hersheys chocolate all over by body rub hersheys chocolate body',
       'get me some of those hot sexy hot fantastic hot latino latino ass ass ass or titties','helicopter','donald trump','bitcoin',
            'ethereum','donald tramp','boobs saucy boobs girls boobs', 'alcohol drinks alcohol clubs alcohol bars alcohol ',
            'titties boobies titties boobas titties nipple titties ', 'saucy so boobs really saucy and saucy so saucy','csgo csgo csgo ','world record world record ',
        'im your daddy im your daddy im your daddy ','boobs','blonds','hairy ass armpit hairy ass','csgo',
            'endri harshul endri nike endri bike endri yo mama endri ','harshul test harshul grade harshul ','give me your test grade give me your test grade test grade',
           'endru andre andri andrew ','endru','albania blockchain','albania','fancy','left nostrile is OK','vodka','rum','party',
           'video games','I have so much work to do','in da club','bootay','monay','booty','cheek','cock','hobo','werk',
           'twerk','italia','football','you like','drama','juicy drama','king','tha juice','drama king','steam','big boner',
           'whatsapp surprise','meaty']
   
    for i in range(25):
        text.append(text[int(np.random.uniform(0,len(text)))])
        
    endriCloud.execute(state,text)

AttributeError: 'NoneType' object has no attribute 'shape'

In [12]:
endriCloud.state

'CA'