## 1. Data preprocessing

In [1]:
import pandas as pd

df_orj = pd.read_csv('/Users/mustafaaktas/PycharmProjects/NLP_Project/data/DataAnalyst.csv', index_col=[0])
df_orj['index'] = [x for x in range(0, len(df_orj.values))]
df = df_orj.dropna()
df = df.iloc[0:50]

for c in df.columns:
    df = df.rename(columns={c: c.replace(" ", "")})

df['JobDescription'] = df['JobDescription'].str[0:50]

df.head()

Unnamed: 0,JobTitle,SalaryEstimate,JobDescription,Rating,CompanyName,Location,Headquarters,Size,Founded,Typeofownership,Industry,Sector,Revenue,Competitors,EasyApply,index
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice\n3.2,"New York, NY","New York, NY",201 to 500 employees,1961,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD),-1,True,0
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York\n3.8,"New York, NY","New York, NY",10000+ employees,1893,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,-1,1
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace\n3.4,"New York, NY","New York, NY",1001 to 5000 employees,2003,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,GoDaddy,-1,2
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity\n4.1,"New York, NY","McLean, VA",201 to 500 employees,2002,Subsidiary or Business Segment,IT Services,Information Technology,$50 to $100 million (USD),-1,-1,3
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel\n3.9,"New York, NY","New York, NY",501 to 1000 employees,2009,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$100 to $500 million (USD),DraftKings,True,4


## 2. Create Milvus Colelction

In [3]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

def create_milvus_collection(collection_name, dim):
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)
    
    fields = [
            FieldSchema(name="index", dtype=DataType.INT64, is_primary=True, auto_id=False),
            FieldSchema(name="job_detail", dtype=DataType.VARCHAR, max_length=500),
            FieldSchema(name="search_job_in_milvus", dtype=DataType.FLOAT_VECTOR, dim=dim)
            ]
            
    schema = CollectionSchema(fields=fields, description='search_job_in_milvus')
    collection = Collection(name=collection_name, schema=schema)
    
    index_params = {
        'metric_type': "L2",
        'index_type': "IVF_FLAT",
        'params': {"nlist": 2253}
    }
    collection.create_index(field_name='search_job_in_milvus', index_params=index_params)
    return collection

connections.connect(host='localhost', port='19530')
collection = create_milvus_collection('search_job_in_milvus', 768)

## 3. Data to Milvus

In [4]:
from towhee import ops, pipe, DataCollection

emb_pipe = (
	pipe.input('df')
        .flat_map('df', 'index', lambda df: df['index'])
        .flat_map('df', 'data', lambda df: df['JobDescription'].values.tolist())
	.map('data', 'embeddings', ops.sentence_embedding.transformers(model_name='sentence-transformers/paraphrase-albert-small-v2'))
)

insert_pipe = (
    emb_pipe.map(('index', 'data', 'embeddings'), 'res', ops.ann_insert.milvus_client(
                host='localhost',
                port='19530',
                collection_name='search_job_in_milvus'
                ))
        .output('res')
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Insert data
insert_pipe(df.iloc[0:50])

<towhee.runtime.data_queue.DataQueue at 0x12eee4d60>

In [226]:
# Check collection
collection.load()
print('Number of data inserted:', collection.num_entities)

Number of data inserted: 0


In [163]:
from pymilvus import utility
utility.drop_collection("search_job_in_milvus")

In [12]:
from pymilvus import utility
utility.list_collections()

['search_job_in_milvus']

## 4. Search the Text

In [37]:
search_pipe = (pipe.input('query')
                    .map('query', 'vec', ops.sentence_embedding.transformers(model_name="sentence-transformers/paraphrase-albert-small-v2"))
                    .flat_map('vec', 'rows', ops.ann_search.milvus_client(host='127.0.0.1', 
                                                                           port='19530',
                                                                           collection_name='search_job_in_milvus',
                                                                           output_fields=['search_job_in_milvus', 'job_detail']))
                    .map('rows', ('id', 'score', 'search_job_in_milvus', 'job_detail'), lambda x: (x[0], x[1], x[2], x[3]))   
                    .output('id', 'job_detail', 'score', 'search_job_in_milvus')
               )

In [38]:
res = search_pipe('Are you eager to roll up your sleeves and harn')
DataCollection(res).show()

id,job_detail,score,search_job_in_milvus
0,Are you eager to roll up your sleeves and harness,74.1100845336914,"[0.5187711,0.68505406,0.5484298,-0.094567634,...] len=768"
20,Note: By applying to this position your applicatio,247.06109619140625,"[-0.09576977,-0.09637873,0.35964483,-0.49779588,...] len=768"
44,Vettery is changing the way people hire and get hi,264.9784545898437,"[-0.28863475,-0.14929363,0.024308397,-0.24706447,...] len=768"
16,Undertone stands alone among AdTech and ad network,281.43829345703125,"[-0.33743906,0.030635009,0.5000177,-0.097591154,...] len=768"
45,Non-Exempt* Schedule: Can be 8a.m. – 5p.m. or 7a.m,286.78857421875,"[-0.14636885,-0.26251167,-0.8691195,-0.254086,...] len=768"
