In [18]:
import pandas as pd
import datetime as dt
import re
import numpy as np

In [2]:
df = pd.read_csv('Stack_Overflow_Jobs.csv')
df.head()

Unnamed: 0,company,location,perks,posted,request_date,salary,tags,title
0,Corelight,"\r\r\n - \r\r\nSan Francisco, CA ...",Visa sponsor,5h ago,2018-11-13 02:38:16.258048,$120k - 170k\r\r\n\r\r\n ...,"user-interface, reactjs, go, node.js, rest",user-interface
1,Lux Group,"\r\r\n - \r\r\nSydney, Australia ...",,< 1h ago,2018-11-13 02:38:16.258048,A$120k - 150k,"web-services, javascript, reactjs, node.js",web-services
2,TFG Co.,"\r\r\n - \r\r\nState of São Paulo,...",,< 1h ago,2018-11-13 02:38:16.258048,,"spark, presto, hive, kubernetes, kafka",spark
3,Wallethub,\r\r\n - \r\r\nNo office location ...,Remote,2h ago,2018-11-13 02:38:16.258048,,"spring, java, spring-mvc, java-ee",spring
4,Wallethub,\r\r\n - \r\r\nNo office location ...,Remote,2h ago,2018-11-13 02:38:16.258048,,"selenium, java, automation, automated-tests, t...",selenium
5,Presence,"\r\r\n - \r\r\nSaint Petersburg, F...",Paid relocation,3h ago,2018-11-13 02:38:16.258048,$90k - 120k\r\r\n\r\r\n ...,"angularjs, .net, c#, .net-core, javascript",angularjs
6,ClearPoint Federal Bank & Trust,"\r\r\n - \r\r\nBatesville, IN",Remote,3h ago,2018-11-13 02:38:16.273671,$90k - 110k,"c#, ssms-2017, javascript, reactjs, git",c#
7,Manulife / John Hancock,"\r\r\n - \r\r\nKitchener, ON, Cana...",On-site and limited remote,3h ago,2018-11-13 02:38:16.273671,C$85k - 115k,"java, javascript, c#, html5, react-relay",java
8,S2P Project Professionals,"\r\r\n - \r\r\nSydney, Australia ...",,4h ago,2018-11-13 02:38:16.273671,A$130k - 180k,"amazon-web-services, java, automation, go, ans...",amazon-web-services
9,Corelight,"\r\r\n - \r\r\nSan Francisco, CA ...",Visa sponsor,4h ago,2018-11-13 02:38:16.273671,Equity,"qa, python, testing, bash, continuous-integration",qa


### Processando a coluna Location

In [3]:
def clear_location(x):
    if 'No office location' in x:
        return None
    
    loc = x.split()
    if '-' in loc:
        loc.remove('-')
    return ' '.join(loc)

df['location'] = df['location'].apply(clear_location)
df.head()

Unnamed: 0,company,location,perks,posted,request_date,salary,tags,title
0,Corelight,"San Francisco, CA",Visa sponsor,5h ago,2018-11-13 02:38:16.258048,$120k - 170k\r\r\n\r\r\n ...,"user-interface, reactjs, go, node.js, rest",user-interface
1,Lux Group,"Sydney, Australia",,< 1h ago,2018-11-13 02:38:16.258048,A$120k - 150k,"web-services, javascript, reactjs, node.js",web-services
2,TFG Co.,"State of São Paulo, Brazil",,< 1h ago,2018-11-13 02:38:16.258048,,"spark, presto, hive, kubernetes, kafka",spark
3,Wallethub,,Remote,2h ago,2018-11-13 02:38:16.258048,,"spring, java, spring-mvc, java-ee",spring
4,Wallethub,,Remote,2h ago,2018-11-13 02:38:16.258048,,"selenium, java, automation, automated-tests, t...",selenium


### Criando a coluna Country

In [4]:
def separate_country(row):
    if not row.location:
        row['location'], row['country'] = (None, None)
    else:
        row['location'], _, row['country'] = row.location.partition(', ')
        
    return row
    
df = df.apply(separate_country, axis=1)
df.head()

Unnamed: 0,company,location,perks,posted,request_date,salary,tags,title,country
0,Corelight,San Francisco,Visa sponsor,5h ago,2018-11-13 02:38:16.258048,$120k - 170k\r\r\n\r\r\n ...,"user-interface, reactjs, go, node.js, rest",user-interface,CA
1,Lux Group,Sydney,,< 1h ago,2018-11-13 02:38:16.258048,A$120k - 150k,"web-services, javascript, reactjs, node.js",web-services,Australia
2,TFG Co.,State of São Paulo,,< 1h ago,2018-11-13 02:38:16.258048,,"spark, presto, hive, kubernetes, kafka",spark,Brazil
3,Wallethub,,Remote,2h ago,2018-11-13 02:38:16.258048,,"spring, java, spring-mvc, java-ee",spring,
4,Wallethub,,Remote,2h ago,2018-11-13 02:38:16.258048,,"selenium, java, automation, automated-tests, t...",selenium,


### Processando a coluna Posted

In [5]:
df.posted.unique()

array(['5h ago', '< 1h ago', '2h ago', '3h ago', '4h ago', '6h ago',
       '7h ago', '8h ago', '9h ago', '10h ago', '11h ago', '13h ago',
       '12h ago', '14h ago', '15h ago', '16h ago', '17h ago', '18h ago',
       '19h ago', '20h ago', '21h ago', 'yesterday', '22h ago', '23h ago',
       '2d ago', '3d ago', '4d ago'], dtype=object)

In [8]:
def replace_times_ago_to_date(row):
    times_ago, req_date = row['posted'], row['request_date']
    
    req_date = dt.datetime.strptime(req_date, "%Y-%m-%d %H:%M:%S.%f")
    numero = re.sub('[^\d]', '',times_ago)

    if 'h ago' in times_ago:
        return req_date - dt.timedelta(hours= int(numero))
    elif 'd ago' in times_ago:
        return req_date - dt.timedelta(days= int(numero))
    else:
        return req_date - dt.timedelta(days= 1)
                 
df['posted'] = df.apply(replace_times_ago_to_date, axis=1)
df.head()

Unnamed: 0,company,location,perks,posted,request_date,salary,tags,title,country
0,Corelight,San Francisco,Visa sponsor,2018-11-12 21:38:16.258048,2018-11-13 02:38:16.258048,$120k - 170k\r\r\n\r\r\n ...,"user-interface, reactjs, go, node.js, rest",user-interface,CA
1,Lux Group,Sydney,,2018-11-13 01:38:16.258048,2018-11-13 02:38:16.258048,A$120k - 150k,"web-services, javascript, reactjs, node.js",web-services,Australia
2,TFG Co.,State of São Paulo,,2018-11-13 01:38:16.258048,2018-11-13 02:38:16.258048,,"spark, presto, hive, kubernetes, kafka",spark,Brazil
3,Wallethub,,Remote,2018-11-13 00:38:16.258048,2018-11-13 02:38:16.258048,,"spring, java, spring-mvc, java-ee",spring,
4,Wallethub,,Remote,2018-11-13 00:38:16.258048,2018-11-13 02:38:16.258048,,"selenium, java, automation, automated-tests, t...",selenium,


In [11]:
df = df.drop(['request_date'], axis=1)

### Processando a coluna Salary

In [25]:
def clear_salary(x):
    if not isinstance(x, float):
        salary = x.split()
        print(salary)

df['salary'].apply(clear_salary)

['$120k', '-', '170k', '|', 'Equity']
['A$120k', '-', '150k']
['$90k', '-', '120k', '|', 'Equity']
['$90k', '-', '110k']
['C$85k', '-', '115k']
['A$130k', '-', '180k']
['Equity']
['$130k', '-', '175k']
['A$90k', '-', '175k', '|', 'Equity']
['$70k', '-', '75k']
['$180k', '-', '220k']
['$100k', '-', '150k']
['MXN', '710k', '-', '1018k']
['$75k', '-', '100k']
['$145k', '-', '175k', '|', 'Equity']
['$80k', '-', '140k', '|', 'Equity']
['$70k', '-', '80k']
['Equity']
['$80k', '-', '150k']
['£50k', '-', '125k']
['R$8k', '-', '8k']
['$20k', '-', '120k']
['$100k', '-', '120k']
['$75k', '-', '100k']
['£60k', '-', '85k']
['Equity']
['€55k', '-', '70k']
['£30k', '-', '47k']
['£30k', '-', '55k']
['$60k', '-', '70k']
['Equity']
['$170k', '-', '200k', '|', 'Equity']
['£25k', '-', '35k']
['$24k', '-', '48k']
['€35k', '-', '55k']
['$50k', '-', '65k']
['Equity']
['£65k', '-', '85k']
['$85k', '-', '120k']
['£45k', '-', '60k']
['R$144k', '-', '168k']
['€38k', '-', '43k']
['$80k', '-', '140k']
['C$60k', '-

0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
        ... 
1020    None
1021    None
1022    None
1023    None
1024    None
1025    None
1026    None
1027    None
1028    None
1029    None
1030    None
1031    None
1032    None
1033    None
1034    None
1035    None
1036    None
1037    None
1038    None
1039    None
1040    None
1041    None
1042    None
1043    None
1044    None
1045    None
1046    None
1047    None
1048    None
1049    None
Name: salary, Length: 1050, dtype: object

In [None]:
df[]

In [None]:
# Valor para conversão de uma moeda para Real
moedas = {
    'R$': 1, # Real
    '$': , # Dolar Americano
    'C$': 2.88, # Dolar Canadense
    '£': 2.88, # Dolar Canadense
    '€': 2.88, # Dolar Canadense
    '₹': 2.88, # Dolar Canadense
    'A$': 2.88, # Dolar Canadense
    'zł': 2.88, # Dolar Canadense
    'R': 2.88, # Dolar Canadense
    'SEK': 2.88, # Dolar Canadense
    'CHF': 2.88, # Dolar Canadense
    'MXN' 2.88, # Dolar Canadense
}