In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw aircraft maintenance log and preview its first rows.
# NOTE(review): '/content/...' is an absolute path (looks like a Colab location) —
# confirm the file exists there in the environment where this notebook runs.
df = pd.read_csv('/content/Aircraft_Annotation_DataFile.csv')
df.head()

Unnamed: 0,IDENT,PROBLEM,ACTION
0,100001,ENGINE IDLE OVERRIDE KILLED ENGINE.,"TRIED TO ADJUST IDLE SEVERAL TIMES, WOULDN'T A..."
1,100002,ENGINE IDLE OVERRIDE KILLED ENGINE.,REMOVED & REPLACED FUEL SERVO
2,100003,ENGINE IDLE OVERRIDE KILLED ENGINE.,"A/C WAS RUN UP, SET IDLE SPEED, MIXTURE OK, NO..."
3,100004,HAD ENGINE CHOKE & BRIEFLY LOSE POWER ON DEPAR...,"PERFORMED ENGINE RUN UP, FOUND CYL 2 LOWER PLU..."
4,100005,#2 & 4 CYL ROCKER COVER GASKETS ARE LEAKING.,REMOVED & REPLACED GASKETS.


In [None]:
# (rows, columns) of the loaded frame
df.shape

(6169, 3)

In [None]:
# Number of missing values in each column
df.isna().sum()

Unnamed: 0,0
IDENT,0
PROBLEM,0
ACTION,0


# Finding Unique problems

In [None]:
# The ten most frequent PROBLEM strings (exact text matches only)
df['PROBLEM'].value_counts().head(10)

Unnamed: 0_level_0,count
PROBLEM,Unnamed: 1_level_1
#2 INTAKE GASKET LEAKING.,82
#2 INTAKE LEAKING.,79
#4 INTAKE LEAKING.,70
#4 INTAKE GASKET LEAKING.,62
#2 INTAKE IS LEAKING.,45
#3 INTAKE LEAKING.,39
#2 ROCKER COVER LEAKING.,38
#2 ROCKER COVER GASKET LEAKING.,36
#4 ROCKER COVER GASKET LEAKING.,34
#3 INTAKE GASKET LEAKING.,34


In [None]:
# Tally every distinct PROBLEM string into a two-column frame: Problem / Count
problem_counts = (
    df['PROBLEM']
    .value_counts()
    .rename_axis('Problem')
    .reset_index(name='Count')
)

# Alphabetical order places near-identical wordings next to each other
sorted_problem_counts = problem_counts.sort_values(by='Problem')

sorted_problem_counts.head(10)

Unnamed: 0,Problem,Count
1651,#1 & 2 CYL INTAKE GASKETS LEAK.,1
1998,#1 & 2 CYL INTAKE GASKETS LEAKING.,1
3075,#1 & 2 CYL ROCKER COVER GASKETS LEAKING.,1
141,#1 & 2 INTAKE GASKETS LEAKING.,5
698,#1 & 2 INTAKES LEAKING ON L/H ENGINE.,2
2099,#1 & 2 INTAKES LEAKING ON R/H ENGINE.,1
197,#1 & 2 INTAKES LEAKING.,4
1169,#1 & 2 ROCKER COVER GASKETS LEAKING ON L/H ENG...,1
131,#1 & 2 ROCKER COVER GASKETS LEAKING.,5
520,#1 & 2 ROCKER COVERS LEAKING ON L/H ENGINE.,2


From this output we can observe that many problems are the same fault described with slightly different wording,
e.g. [rocker cover gaskets are leaking, rocker cover gaskets leaking] and [intake gaskets leak, intake gaskets leaking].

We have 3595 unique problem descriptions.
Numbers such as 2 and 4 carry meaning, so we cannot remove them.
There are also many similar/duplicate rows, e.g. "INTAKE LEAKING" and "INTAKE IS LEAKING".

In [None]:
# Look for repeated records by their text only.
# Comparing whole rows also compares IDENT, which looks like a per-record id
# (100001, 100002, ...), so a whole-row check cannot flag anything — the
# whole-row version of this cell returned an empty frame.
# Rows that repeat an earlier (PROBLEM, ACTION) pair are returned; the first
# occurrence of each pair is not.
duplicate = df[df.duplicated(subset=['PROBLEM', 'ACTION'])]
duplicate

Unnamed: 0,IDENT,PROBLEM,ACTION


Problem:
For data cleaning and processing, we would normally remove punctuation and special characters.
However, for a description such as "#2 & 4 INTAKES LEAKING.", removing # and & might make the sentence harder to analyze.
Also, we cannot remove numbers because they probably mean something (the nth cylinder, engine, etc.).


I can remove the full stop, convert each word to its lemma form, and remove stopwords such as 'is' and 'are'.

# Text Preprocessing

In [None]:
import re
stopwords = ['ARE', 'IS'] # can also add 'was'
def remove_specific_stopwords(text, stopwords):
    """Strip punctuation from ``text``, tokenize it and drop the given stopwords.

    Letters, digits, whitespace and ``/`` (as in ``A/C`` or ``R/H``) are kept.
    Apostrophes are deleted so a contraction stays a single token
    (``WOULDN'T`` -> ``WOULDNT``). Every other character is replaced by a
    space, so text written without spaces is split rather than fused
    (``#2&4`` -> ``2``, ``4`` instead of ``24``).

    Args:
        text: The raw string to clean.
        stopwords: Iterable of words to remove; matching ignores case.

    Returns:
        List of the remaining tokens, in their original order and casing.
    """
    # Delete apostrophes first so contractions are not split into two tokens.
    text = re.sub(r"['\u2019]", '', text)

    # Turn all remaining punctuation into spaces (keeping /). Using a space
    # instead of '' keeps neighbouring words and numbers apart.
    text = re.sub(r'[^a-zA-Z0-9\s/]', ' ', text)

    # Normalise the stopwords once so 'is' and 'IS' are treated alike.
    blocked = {word.upper() for word in stopwords}

    return [token for token in text.split() if token.upper() not in blocked]


In [None]:
# Tokenize the problem descriptions, dropping punctuation and the chosen stopwords
df['processed_problem'] = df['PROBLEM'].apply(remove_specific_stopwords, args=(stopwords,))

# The action descriptions go through exactly the same cleaning
df['processed_action'] = df['ACTION'].apply(remove_specific_stopwords, args=(stopwords,))
df.head(10)

Unnamed: 0,IDENT,PROBLEM,ACTION,processed_problem,processed_action
0,100001,ENGINE IDLE OVERRIDE KILLED ENGINE.,"TRIED TO ADJUST IDLE SEVERAL TIMES, WOULDN'T A...","[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[TRIED, TO, ADJUST, IDLE, SEVERAL, TIMES, WOUL..."
1,100002,ENGINE IDLE OVERRIDE KILLED ENGINE.,REMOVED & REPLACED FUEL SERVO,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[REMOVED, REPLACED, FUEL, SERVO]"
2,100003,ENGINE IDLE OVERRIDE KILLED ENGINE.,"A/C WAS RUN UP, SET IDLE SPEED, MIXTURE OK, NO...","[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[A/C, WAS, RUN, UP, SET, IDLE, SPEED, MIXTURE,..."
3,100004,HAD ENGINE CHOKE & BRIEFLY LOSE POWER ON DEPAR...,"PERFORMED ENGINE RUN UP, FOUND CYL 2 LOWER PLU...","[HAD, ENGINE, CHOKE, BRIEFLY, LOSE, POWER, ON,...","[PERFORMED, ENGINE, RUN, UP, FOUND, CYL, 2, LO..."
4,100005,#2 & 4 CYL ROCKER COVER GASKETS ARE LEAKING.,REMOVED & REPLACED GASKETS.,"[2, 4, CYL, ROCKER, COVER, GASKETS, LEAKING]","[REMOVED, REPLACED, GASKETS]"
5,100006,ROCKER BOX COVER SCREWS LOOSE (ALL CYL).,TIGHTENED SCREWS.,"[ROCKER, BOX, COVER, SCREWS, LOOSE, ALL, CYL]","[TIGHTENED, SCREWS]"
6,100007,INDUCTION TUBE HOSE CLAMPS LOOSE (ALL CYL).,TIGHTENED HOSE CLAMPS.,"[INDUCTION, TUBE, HOSE, CLAMPS, LOOSE, ALL, CYL]","[TIGHTENED, HOSE, CLAMPS]"
7,100008,#3 INTAKE IS LEAKING.,REMOVED & REPLACED GASKET.,"[3, INTAKE, LEAKING]","[REMOVED, REPLACED, GASKET]"
8,100009,#2 INTAKE IS LEAKING.,REMOVED & REPLACED #2 INTAKE GASKET.,"[2, INTAKE, LEAKING]","[REMOVED, REPLACED, 2, INTAKE, GASKET]"
9,100010,#4 ROCKER COVER IS LEAKING.,REMOVED & REPLACED #4 ROCKER COVER GASKET.,"[4, ROCKER, COVER, LEAKING]","[REMOVED, REPLACED, 4, ROCKER, COVER, GASKET]"


## Handling Abbreviations

In [None]:
abbreviations_df = pd.read_csv('Aviation_Abbreviation_Dataset.csv')

# Upper-case both lookup columns so they can be compared with the upper-case log text
for column in ('Abbreviated', 'Standard_Description'):
    abbreviations_df[column] = abbreviations_df[column].map(str.upper)

In [None]:
abbreviations_df.head(10)

Unnamed: 0,Abbriviation_Code,Abbreviated,Standard_Description
0,1001,AGL,ABOVE GROUND LEVEL
1,1002,AF,AIRWAY FACILITIES
2,1003,ALM,ALARM
3,1004,ALT,ALTERNATOR
4,1005,APCH,APPROACH
5,1006,ASSY,ASSEMBLY
6,1007,ASST,ASSISTANT
7,1008,BATT,BATTERY
8,1009,CO,CARBON MONOXIDE
9,1010,CARB,CARBURETOR


In [None]:
# Create a dictionary from the abbreviation DataFrame
abbr_dict = {
    short_form: description
    for short_form, description in zip(
        abbreviations_df['Abbreviated'],
        abbreviations_df['Standard_Description'],
    )
}
# Extra entry added by hand on top of the loaded table
abbr_dict['CYLS'] = 'CYLINDERS'

# Function to replace abbreviations in a list of words
def replace_abbr(words, mapping=None):
    """Expand known abbreviations in a list of tokens.

    Args:
        words: List of tokens to look up.
        mapping: Optional dict of abbreviation -> expansion. When omitted,
            the notebook-level ``abbr_dict`` is used, so existing calls such
            as ``series.apply(replace_abbr)`` behave as before.

    Returns:
        A new list of the same length in which every token found in the
        mapping is replaced by its expansion; other tokens are unchanged.
        Lookup is by exact match, so it is case-sensitive.
    """
    # Compare against None so an explicitly passed empty dict is respected.
    lookup = abbr_dict if mapping is None else mapping
    return [lookup.get(word, word) for word in words]

# Expand abbreviations in both token columns
for column in ('processed_problem', 'processed_action'):
    df[column] = df[column].apply(replace_abbr)

In [None]:
df.head(10)

Unnamed: 0,IDENT,PROBLEM,ACTION,processed_problem,processed_action
0,100001,ENGINE IDLE OVERRIDE KILLED ENGINE.,"TRIED TO ADJUST IDLE SEVERAL TIMES, WOULDN'T A...","[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[TRIED, TO, ADJUST, IDLE, SEVERAL, TIMES, WOUL..."
1,100002,ENGINE IDLE OVERRIDE KILLED ENGINE.,REMOVED & REPLACED FUEL SERVO,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[REMOVED, REPLACED, FUEL, SERVO]"
2,100003,ENGINE IDLE OVERRIDE KILLED ENGINE.,"A/C WAS RUN UP, SET IDLE SPEED, MIXTURE OK, NO...","[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[A/C, WAS, RUN, UP, SET, IDLE, SPEED, MIXTURE,..."
3,100004,HAD ENGINE CHOKE & BRIEFLY LOSE POWER ON DEPAR...,"PERFORMED ENGINE RUN UP, FOUND CYL 2 LOWER PLU...","[HAD, ENGINE, CHOKE, BRIEFLY, LOSE, POWER, ON,...","[PERFORMED, ENGINE, RUN, UP, FOUND, CYLINDER, ..."
4,100005,#2 & 4 CYL ROCKER COVER GASKETS ARE LEAKING.,REMOVED & REPLACED GASKETS.,"[2, 4, CYLINDER, ROCKER, COVER, GASKETS, LEAKING]","[REMOVED, REPLACED, GASKETS]"
5,100006,ROCKER BOX COVER SCREWS LOOSE (ALL CYL).,TIGHTENED SCREWS.,"[ROCKER, BOX, COVER, SCREWS, LOOSE, ALL, CYLIN...","[TIGHTENED, SCREWS]"
6,100007,INDUCTION TUBE HOSE CLAMPS LOOSE (ALL CYL).,TIGHTENED HOSE CLAMPS.,"[INDUCTION, TUBE, HOSE, CLAMPS, LOOSE, ALL, CY...","[TIGHTENED, HOSE, CLAMPS]"
7,100008,#3 INTAKE IS LEAKING.,REMOVED & REPLACED GASKET.,"[3, INTAKE, LEAKING]","[REMOVED, REPLACED, GASKET]"
8,100009,#2 INTAKE IS LEAKING.,REMOVED & REPLACED #2 INTAKE GASKET.,"[2, INTAKE, LEAKING]","[REMOVED, REPLACED, 2, INTAKE, GASKET]"
9,100010,#4 ROCKER COVER IS LEAKING.,REMOVED & REPLACED #4 ROCKER COVER GASKET.,"[4, ROCKER, COVER, LEAKING]","[REMOVED, REPLACED, 4, ROCKER, COVER, GASKET]"


In [None]:
# Keep only the two token columns in a separate frame. Selecting the columns
# and copying gives the same frame as building it from a list of Series and
# transposing, without the intermediate wide frame.
processed_df = df[['processed_problem', 'processed_action']].copy()
processed_df.head(10)

Unnamed: 0,processed_problem,processed_action
0,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[TRIED, TO, ADJUST, IDLE, SEVERAL, TIMES, WOUL..."
1,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[REMOVED, REPLACED, FUEL, SERVO]"
2,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[A/C, WAS, RUN, UP, SET, IDLE, SPEED, MIXTURE,..."
3,"[HAD, ENGINE, CHOKE, BRIEFLY, LOSE, POWER, ON,...","[PERFORMED, ENGINE, RUN, UP, FOUND, CYLINDER, ..."
4,"[2, 4, CYLINDER, ROCKER, COVER, GASKETS, LEAKING]","[REMOVED, REPLACED, GASKETS]"
5,"[ROCKER, BOX, COVER, SCREWS, LOOSE, ALL, CYLIN...","[TIGHTENED, SCREWS]"
6,"[INDUCTION, TUBE, HOSE, CLAMPS, LOOSE, ALL, CY...","[TIGHTENED, HOSE, CLAMPS]"
7,"[3, INTAKE, LEAKING]","[REMOVED, REPLACED, GASKET]"
8,"[2, INTAKE, LEAKING]","[REMOVED, REPLACED, 2, INTAKE, GASKET]"
9,"[4, ROCKER, COVER, LEAKING]","[REMOVED, REPLACED, 4, ROCKER, COVER, GASKET]"


# Lemmatisation



---



In [None]:
import pandas as pd
import spacy

# Load the spaCy model for English
nlp = spacy.load('en_core_web_sm')

# Function to lemmatize a list of tokens
def lemmatize_tokens(tokens):
    """Return the spaCy lemma of each word in a list of tokens.

    The tokens are joined with spaces, lower-cased and run through the loaded
    ``nlp`` pipeline. Lower-casing matters because the tokens arrive in upper
    case, and upper-case input came back mostly unchanged and in mixed case
    (e.g. 'IDLE', 'GASKETS' next to 'engine').

    Args:
        tokens: List of word strings for one record.

    Returns:
        List of lemma strings produced by spaCy for the joined text.
    """
    # NOTE(review): spaCy re-tokenizes the joined string, so the result may not
    # line up one-to-one with `tokens` (e.g. for 'A/C' or 'R/H') — confirm.
    doc = nlp(" ".join(tokens).lower())
    return [token.lemma_ for token in doc]

# Apply lemmatization to each row of the 'processed_problem' column and store
# the result in a new 'lemmatized_problems' column
processed_df['lemmatized_problems'] = processed_df['processed_problem'].apply(lemmatize_tokens)

processed_df.head(10)

Unnamed: 0,processed_problem,processed_action,lemmatized_problems
0,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[TRIED, TO, ADJUST, IDLE, SEVERAL, TIMES, WOUL...","[engine, IDLE, OVERRIDE, kill, engine]"
1,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[REMOVED, REPLACED, FUEL, SERVO]","[engine, IDLE, OVERRIDE, kill, engine]"
2,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[A/C, WAS, RUN, UP, SET, IDLE, SPEED, MIXTURE,...","[engine, IDLE, OVERRIDE, kill, engine]"
3,"[HAD, ENGINE, CHOKE, BRIEFLY, LOSE, POWER, ON,...","[PERFORMED, ENGINE, RUN, UP, FOUND, CYLINDER, ...","[have, ENGINE, CHOKE, BRIEFLY, LOSE, POWER, ON..."
4,"[2, 4, CYLINDER, ROCKER, COVER, GASKETS, LEAKING]","[REMOVED, REPLACED, GASKETS]","[2, 4, cylinder, ROCKER, COVER, GASKETS, leaking]"
5,"[ROCKER, BOX, COVER, SCREWS, LOOSE, ALL, CYLIN...","[TIGHTENED, SCREWS]","[ROCKER, BOX, COVER, SCREWS, LOOSE, all, cylin..."
6,"[INDUCTION, TUBE, HOSE, CLAMPS, LOOSE, ALL, CY...","[TIGHTENED, HOSE, CLAMPS]","[INDUCTION, TUBE, HOSE, CLAMPS, LOOSE, all, cy..."
7,"[3, INTAKE, LEAKING]","[REMOVED, REPLACED, GASKET]","[3, intake, leaking]"
8,"[2, INTAKE, LEAKING]","[REMOVED, REPLACED, 2, INTAKE, GASKET]","[2, INTAKE, leaking]"
9,"[4, ROCKER, COVER, LEAKING]","[REMOVED, REPLACED, 4, ROCKER, COVER, GASKET]","[4, rocker, COVER, leaking]"


In [None]:
# Print every processed problem that contains the bare token 'LEAK'.
# In each of these rows the word LEAK could be replaced by LEAKING.
leak_rows = [tokens for tokens in df['processed_problem'] if 'LEAK' in tokens]
for tokens in leak_rows:
    print(tokens)

['R/H', 'ENGINE', '3', '2', 'INTAKES', 'LEAK']
['2', '3', '4', 'ROCKER', 'COVERS', 'LEAK']
['OIL', 'LEAK', 'EVIDENCE', 'ON', 'R/H', 'SIDE', 'OF', 'ENGINE']
['OIL', 'LEAK', 'EVIDENCE', 'ON', 'R/H', 'SIDE', 'OF', 'ENGINE']
['OIL', 'LEAK', 'FROM', 'L/H', 'ENGINE']
['OIL', 'LEAK', 'FROM', 'L/H', 'ENGINE']
['OIL', 'LEAK', 'NOTED', 'ON', 'FRONT', 'ENGINE', 'SECTION']
['OIL', 'LEAK', 'NOTED', 'ON', 'FRONT', 'ENGINE', 'SECTION']
['CYLINDER', '2', 'INTAKE', 'TUBE', 'AT', 'SUMP', 'HAS', 'POSSIBLE', 'FUEL', 'LEAK']
['LEAKING', 'INTAKE', 'TUBE', 'CYLINDER', '4', 'LEAK', 'AT', 'ATTACH', 'POINT', 'TO', 'OIL', 'SUMP']
['LEAKING', 'INTAKE', 'TUBE', 'CYLINDER', '4', 'LEAK', 'AT', 'ATTACH', 'POINT', 'TO', 'OIL', 'SUMP']
['OIL', 'LEAK', 'ON', 'R/H', 'ENGINE', 'BACK', 'BONE', 'CASE', 'SEAM']
['OIL', 'LEAK', 'ON', 'R/H', 'ENGINE', 'BACK', 'BONE', 'CASE', 'SEAM']
['3', '2', 'ROCKER', 'COVER', 'GASKETS', 'LEAK', 'R/H', 'ENGINE']
['3', '2', 'ROCKER', 'COVER', 'GASKETS', 'LEAK', 'L/H', 'ENGINE']
['4', 'ROCKER'

# Converting tokenised words back to sentence.


In [None]:
def _tokens_to_sentence(tokens):
    """Join one record's tokens into a single space-separated sentence."""
    # A token may itself be a list; flatten it to one string before joining.
    words = (
        ' '.join(map(str, word)) if isinstance(word, list) else str(word)
        for word in tokens
    )
    return ' '.join(words).strip()


# One sentence per record, in the same order as df
problem_list = [_tokens_to_sentence(tokens) for tokens in df['processed_problem']]
count = len(problem_list)

# Preview a few sentences only; printing every record floods the cell output.
for sentence in problem_list[:10]:
    print(sentence)

print(count)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2 4 BAFFLE OUTER TIE SPRING BROKEN
2 4 BAFFLE INNER TIE SPRING OUT OF PLACE
2 ROCKER COVER LEAKING
OIL LEAK FROM L/H ENGINE
OIL LEAK FROM L/H ENGINE
2 ROCKER COVER LEAKING ON L/H ENGINE
2 ROCKER COVER LEAKING ON R/H ENGINE
2 ROCKER COVER THRUST BUTTON ON R/H ENGINE BROKEN
2 ROCKER COVER THRUST BUTTONS ON L/H ENGINE BROKEN
CYLINDER 3 BAFFLE SCREW SECURED TO CRANKCASE MISSING
ROCKER COVER GASKET CYLINDER 3 LEAKING
BAFFLE SUPPORT ON AFT COPILOT SIDE ENGINE BAFFLE LOOSE
AFTER COMPLETING STEEP SPIRALS AT POWER IDLE SMOOTHLY APPLI
AFTER COMPLETING STEEP SPIRALS AT POWER IDLE SMOOTHLY APPLI
R/H ENGINE 3 CYLINDER LOOSE BAFFLE RUBBING ON OIL RETURN LINE CY
CYLINDER 2 INTAKE LEAKING
R/H ENGINE 3 INTAKE LEAKING
R/H ENGINE L/H FORWARD BAFFLE SEAL GONE
L/H SIDE BAFFLE CRACKED
PLUGS FOR ENGINE BAFFLING WORN
3 4 ROCKER COVERS LEAKING
R/H ENGINE 2 BAFFLE MISSING A BOLT
R/H 4 AFT BAFFLE CRACKED
ROCKER BOX COVER SCREWS LOOSE
SIDE BAFFLE PL

In [None]:
# Attach the re-joined problem sentences as a new column (one string per row, in df order)
df['final_processed_problem'] = problem_list

This reduced the number of unique problems (clusters) from 3595 to 3386, i.e. 209 duplicate descriptions were merged.

Some abbreviations, such as CYLS and THR, are still present.

# Final Preprocessing


In [None]:
processed_df.head()

Unnamed: 0,processed_problem,processed_action,lemmatized_problems
0,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[TRIED, TO, ADJUST, IDLE, SEVERAL, TIMES, WOUL...","[engine, IDLE, OVERRIDE, kill, engine]"
1,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[REMOVED, REPLACED, FUEL, SERVO]","[engine, IDLE, OVERRIDE, kill, engine]"
2,"[ENGINE, IDLE, OVERRIDE, KILLED, ENGINE]","[A/C, WAS, RUN, UP, SET, IDLE, SPEED, MIXTURE,...","[engine, IDLE, OVERRIDE, kill, engine]"
3,"[HAD, ENGINE, CHOKE, BRIEFLY, LOSE, POWER, ON,...","[PERFORMED, ENGINE, RUN, UP, FOUND, CYLINDER, ...","[have, ENGINE, CHOKE, BRIEFLY, LOSE, POWER, ON..."
4,"[2, 4, CYLINDER, ROCKER, COVER, GASKETS, LEAKING]","[REMOVED, REPLACED, GASKETS]","[2, 4, cylinder, ROCKER, COVER, GASKETS, leaking]"


### Basic model : cosine similarity

In [None]:
# convert all list of words to sentences

# Join each token list into one sentence. Values that are already strings are
# left alone so that re-running this cell does not join a sentence's
# individual characters with spaces.
for column in ('processed_problem', 'processed_action'):
    df[column] = df[column].apply(
        lambda value: value if isinstance(value, str) else ' '.join(value)
    )

In [None]:
df.head()

Unnamed: 0,IDENT,PROBLEM,ACTION,processed_problem,processed_action,final_processed_problem
0,100001,ENGINE IDLE OVERRIDE KILLED ENGINE.,"TRIED TO ADJUST IDLE SEVERAL TIMES, WOULDN'T A...",ENGINE IDLE OVERRIDE KILLED ENGINE,TRIED TO ADJUST IDLE SEVERAL TIMES WOULDNT ADJUST,ENGINE IDLE OVERRIDE KILLED ENGINE
1,100002,ENGINE IDLE OVERRIDE KILLED ENGINE.,REMOVED & REPLACED FUEL SERVO,ENGINE IDLE OVERRIDE KILLED ENGINE,REMOVED REPLACED FUEL SERVO,ENGINE IDLE OVERRIDE KILLED ENGINE
2,100003,ENGINE IDLE OVERRIDE KILLED ENGINE.,"A/C WAS RUN UP, SET IDLE SPEED, MIXTURE OK, NO...",ENGINE IDLE OVERRIDE KILLED ENGINE,A/C WAS RUN UP SET IDLE SPEED MIXTURE OK NO LE...,ENGINE IDLE OVERRIDE KILLED ENGINE
3,100004,HAD ENGINE CHOKE & BRIEFLY LOSE POWER ON DEPAR...,"PERFORMED ENGINE RUN UP, FOUND CYL 2 LOWER PLU...",HAD ENGINE CHOKE BRIEFLY LOSE POWER ON DEPARTU...,PERFORMED ENGINE RUN UP FOUND CYLINDER 2 LOWER...,HAD ENGINE CHOKE BRIEFLY LOSE POWER ON DEPARTU...
4,100005,#2 & 4 CYL ROCKER COVER GASKETS ARE LEAKING.,REMOVED & REPLACED GASKETS.,2 4 CYLINDER ROCKER COVER GASKETS LEAKING,REMOVED REPLACED GASKETS,2 4 CYLINDER ROCKER COVER GASKETS LEAKING


In [None]:
# Lower-case the joined problem and action sentences
for column in ('processed_problem', 'processed_action'):
    df[column] = df[column].str.lower()

In [None]:
df.head()

Unnamed: 0,IDENT,PROBLEM,ACTION,processed_problem,processed_action,final_processed_problem
0,100001,ENGINE IDLE OVERRIDE KILLED ENGINE.,"TRIED TO ADJUST IDLE SEVERAL TIMES, WOULDN'T A...",engine idle override killed engine,tried to adjust idle several times wouldnt adjust,ENGINE IDLE OVERRIDE KILLED ENGINE
1,100002,ENGINE IDLE OVERRIDE KILLED ENGINE.,REMOVED & REPLACED FUEL SERVO,engine idle override killed engine,removed replaced fuel servo,ENGINE IDLE OVERRIDE KILLED ENGINE
2,100003,ENGINE IDLE OVERRIDE KILLED ENGINE.,"A/C WAS RUN UP, SET IDLE SPEED, MIXTURE OK, NO...",engine idle override killed engine,a/c was run up set idle speed mixture ok no le...,ENGINE IDLE OVERRIDE KILLED ENGINE
3,100004,HAD ENGINE CHOKE & BRIEFLY LOSE POWER ON DEPAR...,"PERFORMED ENGINE RUN UP, FOUND CYL 2 LOWER PLU...",had engine choke briefly lose power on departu...,performed engine run up found cylinder 2 lower...,HAD ENGINE CHOKE BRIEFLY LOSE POWER ON DEPARTU...
4,100005,#2 & 4 CYL ROCKER COVER GASKETS ARE LEAKING.,REMOVED & REPLACED GASKETS.,2 4 cylinder rocker cover gaskets leaking,removed replaced gaskets,2 4 CYLINDER ROCKER COVER GASKETS LEAKING


In [None]:
# Overwrite the token-list columns with the lower-cased sentence strings from df
for column in ('processed_problem', 'processed_action'):
    processed_df[column] = df[column]
processed_df.head(10)

Unnamed: 0,processed_problem,processed_action,lemmatized_problems
0,engine idle override killed engine,tried to adjust idle several times wouldnt adjust,"[engine, IDLE, OVERRIDE, kill, engine]"
1,engine idle override killed engine,removed replaced fuel servo,"[engine, IDLE, OVERRIDE, kill, engine]"
2,engine idle override killed engine,a/c was run up set idle speed mixture ok no le...,"[engine, IDLE, OVERRIDE, kill, engine]"
3,had engine choke briefly lose power on departu...,performed engine run up found cylinder 2 lower...,"[have, ENGINE, CHOKE, BRIEFLY, LOSE, POWER, ON..."
4,2 4 cylinder rocker cover gaskets leaking,removed replaced gaskets,"[2, 4, cylinder, ROCKER, COVER, GASKETS, leaking]"
5,rocker box cover screws loose all cylinder,tightened screws,"[ROCKER, BOX, COVER, SCREWS, LOOSE, all, cylin..."
6,induction tube hose clamps loose all cylinder,tightened hose clamps,"[INDUCTION, TUBE, HOSE, CLAMPS, LOOSE, all, cy..."
7,3 intake leaking,removed replaced gasket,"[3, intake, leaking]"
8,2 intake leaking,removed replaced 2 intake gasket,"[2, INTAKE, leaking]"
9,4 rocker cover leaking,removed replaced 4 rocker cover gasket,"[4, rocker, COVER, leaking]"


In [None]:
# Drop the lemma column before export. Reassigning instead of using inplace,
# together with errors='ignore', lets the cell be re-run without a KeyError
# once the column is already gone.
processed_df = processed_df.drop(columns=['lemmatized_problems'], errors='ignore')

In [None]:
processed_df.head()

Unnamed: 0,processed_problem,processed_action
0,engine idle override killed engine,tried to adjust idle several times wouldnt adjust
1,engine idle override killed engine,removed replaced fuel servo
2,engine idle override killed engine,a/c was run up set idle speed mixture ok no le...
3,had engine choke briefly lose power on departu...,performed engine run up found cylinder 2 lower...
4,2 4 cylinder rocker cover gaskets leaking,removed replaced gaskets


In [None]:
# Save the cleaned problem/action text to a CSV file in the working directory.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default — confirm downstream readers expect it (otherwise pass index=False).
processed_df.to_csv('processed_aviation_dataset.csv')