In [499]:
# coding: utf-8

# In[1]:


import re # Regular expressions
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
import os


# In[2]:


def text_cleaner(job):

    text = str(job) # Get the text from this
    
    lines = (line.strip() for line in text.splitlines()) # break into lines
    
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    
    def chunk_space(chunk):
        chunk_out = chunk + ' ' # Need to fix spacing issue
        return chunk_out  
        
    text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line

    # Now clean out all of the unicode junk (this line works great!!!)
        
    try:
        text = text.decode('utf-8') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
       
    text = re.sub("[^a-zA-Z.+3]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                                # Also include + for C++
        
    text = text.lower().split()  # Go to lower case and split them apart
        
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
        
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
        
    return text


# In[3]:


def get_skills(num, city):
    
    csv_path = os.path.join('..', '..', '03_Data', 'Job_Data.csv')
    df = pd.read_csv(csv_path, header=0)

    job_desc = df.loc[df['Search_City']==city,'Description']
    job_desc = job_desc.get_values()

    words = []
    
    if len(job_desc) > 0:
        for j in range(0, len(job_desc), 1):
            final_description = text_cleaner(job_desc[j])
            if len(final_description) > 0: 
                words.append(final_description)

        doc_frequency = Counter() # This will create a full counter of our terms. 
        [doc_frequency.update(item) for item in words] 

        # Now we can just look at our final dict list inside doc_frequency

        # Obtain our key terms and store them in a dict. These are the key data science skills we are looking for

        skills_dict = Counter({
                                    "Java": doc_frequency["java"], 
                                    "Excel": doc_frequency["excel"],
                                    "VBA": doc_frequency["vba"],
                                    "D3.js": doc_frequency["d3.js"],
                                    "C++": doc_frequency["c++"], 
                                    "PHP": doc_frequency["php"], 
                                    "Perl": doc_frequency["perl"],
                                    "Python": doc_frequency["python"], 
                                    "Matlab": doc_frequency["matlab"],
                                    "SQL": doc_frequency["sql"], 
                                    "Ruby": doc_frequency["ruby"], 
                                    "R": doc_frequency["r"],
                                    "SAS": doc_frequency["sas"],
                                    "SPSS": doc_frequency["spss"],
                                    "HTML": doc_frequency["html"],  
                                    "JavaScript": doc_frequency["javascript"],
                                    "XML": doc_frequency["xml"],
                                    "Julia": doc_frequency["julia"], 
                                    "Scala": doc_frequency["scala"],
                                    "Tableau": doc_frequency["tableau"],
                                    "Looker": doc_frequency["looker"],
                                    "Hadoop": doc_frequency["hadoop"], 
                                    "Spark": doc_frequency["spark"], 
                                    "MySQL": doc_frequency["mysql"],
                                    "HBase": doc_frequency["hbase"], 
                                    "MongoDB": doc_frequency["mongodb"], 
                                    "Theano": doc_frequency["theano"], 
                                    "TensorFlow": doc_frequency["tensorflow"], 
                                    "Caffe": doc_frequency["caffe"], 
                                    "Keras": doc_frequency["keras"], 
                                    "Pig": doc_frequency["pig"], 
                                    "Hive": doc_frequency["hive"], 
                                    "NoSQL": doc_frequency["nosql"], 
                                    "NumPy": doc_frequency["numpy"], 
                                    "SciPy": doc_frequency["scipy"], 
                                    "Git": doc_frequency["git"], 
                                    "PyTorch": doc_frequency["pytorch"], 
                                    "OpenCV": doc_frequency["opencv"], 
                                    "CUDA": doc_frequency["cuda"], 
                                    "Torch": doc_frequency["torch"], 
                                    "MXNet": doc_frequency["mxnet"], 
                                    "AWS": doc_frequency["aws"], 
                                    "cuDNN": doc_frequency["cudnn"], 
                                    "CNTK": doc_frequency["cntk"], 
                                    "Elasticsearch": doc_frequency["elasticsearch"], 
                                    "Cassandra": doc_frequency["cassandra"]
                                    })
                      
#         misc_dict = {"Dashboards": doc_frequency["dashboards"],
#                                 "Visualization": doc_frequency["visualization"],
#                                  "Database": doc_frequency["database"],
#                                 "Statistics": doc_frequency["statistics"],
#                                  "Algebra": doc_frequency["algebra"],
#                                  "Calculus": doc_frequency["calculus"],
#                                  "Algorithms": doc_frequency["algorithms"]
#                     }
                          
        skills_df = pd.DataFrame.from_dict(skills_dict, orient='index')
        skills_df = skills_df.sort_values(by=0, ascending=False)
        skills_df = skills_df.loc[skills_df[0]>0,:].reset_index()
        
        skills_df = skills_df.rename(columns={"index": "skill_type", 0: str(city)+'_count'})
        
        
    else:
        skills_df = print["Error: City not found"]

    return  skills_df






In [500]:
get_skills(1, "New York")

Unnamed: 0,skill_type,New York_count
0,SQL,43
1,Excel,39
2,R,28
3,Tableau,25
4,Python,25
5,SAS,16
6,Java,6
7,SPSS,6
8,Hadoop,5
9,MySQL,4


In [483]:
s = get_skills("San Francisco")

In [497]:
city1 = get_skills("New York") 
city2 = get_skills("San Francisco")
    
s = pd.merge(city1, city2, how='outer', on='skill_type', left_index=True, right_index=True)

s = s.rename(columns={str("New York")+"_count":"city1", str("San Francisco")+"_count":"city2"})
s

Unnamed: 0,skill_type,city1,city2
0,SQL,43.0,75
1,Excel,39.0,48
2,R,28.0,32
3,Tableau,25.0,30
4,Python,25.0,30
5,SAS,16.0,13
6,Java,6.0,11
7,SPSS,6.0,11
8,Hadoop,5.0,7
9,MySQL,4.0,6


In [378]:
skills = get_skills("San Francisco")
# labels, values = zip(*skills.items())
labels, values = zip(*skills.items())
# values = skills.values
data = {
        "labels": labels,
        "values": values,
        "type": "pie"}
# data
# labels = skills
# labels = labels.to_list()
skills

Unnamed: 0,skill_type,San Francisco_count
9,SQL,75
7,Python,48
11,R,32
1,Excel,30
19,Tableau,30
21,Hadoop,13
22,Spark,11
12,SAS,11
20,Looker,7
31,Hive,6


In [68]:
from skills_info import get_skills

def sf_skills():
    skills1_count = get_skills("San Francisco")[0]
    return skills1_count
    
if __name__ == '__main__':
    skills1_count = get_skills("San Francisco")[0]
    labels, values = zip(*skills1_count.items())

    print(labels, values)

('Java', 'Excel', 'VBA', 'D3.js', 'C++', 'PHP', 'Perl', 'Python', 'Matlab', 'SQL', 'Ruby', 'R', 'SAS', 'SPSS', 'HTML', 'JavaScript', 'XML', 'Scala', 'Tableau', 'Looker', 'Hadoop', 'Spark', 'MySQL', 'HBase', 'Theano', 'TensorFlow', 'Caffe', 'Pig', 'Hive', 'NoSQL', 'NumPy', 'SciPy', 'Git', 'OpenCV', 'CUDA', 'Torch', 'AWS', 'Cassandra') (5, 30, 2, 2, 3, 1, 1, 48, 2, 75, 3, 32, 11, 5, 1, 6, 2, 3, 30, 7, 13, 11, 3, 1, 1, 2, 1, 1, 6, 3, 4, 2, 5, 1, 1, 1, 5, 1)
